library(ggplot2)
library(devtools)
library(phyloseq)
library(picante)
library(tidyr)
library(forcats) 
library(dplyr)
library(tibble)
library(cowplot)
library(picante)    # Will also include ape and vegan 
library(car)        # For residual analysis
library(sandwich)   # for vcovHC function in post-hoc test
library(MASS)       # For studres in plot_residuals function
library(caret)      # For cross validation
library(pander)     # For pretty tables 
library(glmnet)     # For lasso and ridge regressions 
source("code/Muskegon_functions.R")
source("code/set_colors.R") 

Load data

# Loads a phyloseq object named otu_merged_musk_pruned)
load("data/surface_PAFL_otu_pruned_raw.RData")
# The name of the phyloseq object is: 
surface_PAFL_otu_pruned_raw 
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 7806 taxa and 24 samples ]
## sample_data() Sample Data:       [ 24 samples by 44 sample variables ]
## tax_table()   Taxonomy Table:    [ 7806 taxa by 8 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 7806 tips and 7804 internal nodes ]
# Remove doubletons!
surface_PAFL_otu_pruned_rm2 <- prune_taxa(taxa_sums(surface_PAFL_otu_pruned_raw) > 2, surface_PAFL_otu_pruned_raw) 
surface_PAFL_otu_pruned_rm2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2979 taxa and 24 samples ]
## sample_data() Sample Data:       [ 24 samples by 44 sample variables ]
## tax_table()   Taxonomy Table:    [ 2979 taxa by 8 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2979 tips and 2977 internal nodes ]
# Remove tree for computational efficiency 
surface_PAFL_otu_pruned_notree_rm2 <- phyloseq(tax_table(surface_PAFL_otu_pruned_rm2), otu_table(surface_PAFL_otu_pruned_rm2), sample_data(surface_PAFL_otu_pruned_rm2)) 


# Gather the metadata in a dataframe to play with 
metadata <- data.frame(sample_data(surface_PAFL_otu_pruned_notree_rm2)) %>%
      mutate(fraction = factor(fraction, levels = c("WholePart","WholeFree")),
         lakesite = factor(lakesite,  levels = c("MOT", "MDP", "MBR", "MIN")),
         fraction = fct_recode(fraction, Particle = "WholePart", Free = "WholeFree"),
         lakesite = fct_recode(lakesite, Outlet = "MOT", Deep = "MDP", Bear = "MBR", River = "MIN"))
row.names(metadata) <- metadata$norep_filter_name

# Replace the sample data 
sample_data(surface_PAFL_otu_pruned_notree_rm2) <- metadata

Production over Station and Season

ggplot(dplyr::filter(metadata, year == "2015" & fraction %in% c("Free", "Particle") &
                       norep_filter_name != "MOTEJ515"), 
       aes(x = lakesite, y = log10(as.numeric(fraction_bac_abund)), fill = fraction)) + 
  geom_bar(stat = "identity", color = "black",  position=position_dodge()) + 
  facet_grid(. ~ season) + 
  scale_y_continuous(expand = c(0,0)) +
  scale_fill_manual(values = fraction_colors) + 
  guides(fill=guide_legend(nrow=1,byrow=TRUE)) +
  xlab("Lake Station") +  ylab("Log10(Bacterial Cells/mL)") +
  theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1),
        legend.position = "bottom", legend.title = element_blank(),
        plot.margin = unit(c(0.25, 0.25, 0.25, 0.5), "cm")) #top, right, bottom, and left margins)

prod1 <- ggplot(dplyr::filter(metadata, year == "2015" & fraction %in% c("Free", "Particle")), 
       aes(x = lakesite, y = frac_bacprod, fill = fraction)) + 
  geom_bar(stat = "identity", color = "black",  position=position_dodge()) + 
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod), 
                position = position_dodge(width = 0.75), width = 0.25) +
  facet_grid(. ~ season) + 
  scale_y_continuous(expand = c(0,0), limits = c(0, 80), 
                     breaks = seq(from = 0, to = 80, by = 15)) +
  scale_fill_manual(values = fraction_colors) + 
  guides(fill=guide_legend(nrow=1,byrow=TRUE)) +
  xlab("Lake Station") + ylab("Community Production\n(ug C/L/Hr)") + 
  theme(axis.text.x = element_blank(), #element_text(angle = 30, vjust = 1, hjust = 1),
        axis.title.x = element_blank(),
        legend.position = c(0.85, 0.9), legend.title = element_blank(),
        plot.margin = unit(c(0.25, 0.25, 0.25, 0.25), "cm")) #top, right, bottom, and left margins)


prod2 <- ggplot(dplyr::filter(metadata, year == "2015" & fraction %in% c("Free", "Particle")), 
       aes(x = lakesite, y = fracprod_per_cell_noinf, fill = fraction)) + 
  geom_bar(stat = "identity", color = "black",  position=position_dodge()) + 
  facet_grid(. ~ season) +
  scale_y_continuous(expand = c(0,0)) +
  scale_fill_manual(values = fraction_colors) + 
  guides(fill=guide_legend(nrow=1,byrow=TRUE)) +
  xlab("Lake Station") + ylab("Cellular Production \n(ug C/cell/Hr)") + 
  theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1),
        legend.position = c(0.85, 0.9), legend.title = element_blank(),
        plot.margin = unit(c(0.25, 0.25, 0.25, 0.5), "cm")) #top, right, bottom, and left margins)


########## LONG FORMAT OF TOTAL PRODUCTIVITY
ggplot(dplyr::filter(metadata, year == "2015" & fraction == "Free"), 
       aes(x = lakesite, y = tot_bacprod)) + 
  geom_bar(stat = "identity", color = "black", fill = "grey", position=position_dodge()) + 
  geom_errorbar(aes(ymin = tot_bacprod - SD_tot_bacprod, ymax = tot_bacprod + SD_tot_bacprod),width = 0.25) +
  facet_grid(season ~., scales = "free") + 
  scale_y_continuous(expand = c(0,0), limits = c(0, 80), 
                     breaks = seq(from = 0, to = 80, by = 15)) +
  scale_fill_manual(values = fraction_colors) + 
  guides(fill=guide_legend(nrow=1,byrow=TRUE)) +
  xlab("Lake Station") + ylab("Community Production\n(ug C/L/Hr)") + 
  theme(legend.position = c(0.85, 0.9), legend.title = element_blank(),
        plot.margin = unit(c(0.25, 0.25, 0.25, 0.25), "cm")) #top, right, bottom, and left margins)
## Warning: Removed 1 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_errorbar).

########## LONG FORMAT OF COMMUNITY PRODUCTIVITY
community_prod_plot_long <- 
  ggplot(dplyr::filter(metadata, year == "2015" & fraction %in% c("Free", "Particle")), 
       aes(x = lakesite, y = frac_bacprod, fill = fraction)) + 
  geom_bar(stat = "identity", color = "black",  position=position_dodge()) + 
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod), 
                position = position_dodge(width = 0.75), width = 0.25) +
  facet_grid(season ~., scales = "free") + 
  scale_y_continuous(expand = c(0,0), limits = c(0, 80), 
                     breaks = seq(from = 0, to = 80, by = 15)) +
  scale_fill_manual(values = fraction_colors) + 
  guides(fill=guide_legend(nrow=1,byrow=TRUE)) +
  xlab("Lake Station") + ylab("Community Production\n(ug C/L/Hr)") + 
  theme(legend.position = "bottom", legend.title = element_blank(),
        plot.margin = unit(c(0.25, 0.25, 0.25, 0.25), "cm")) #top, right, bottom, and left margins)


########## LONG FORMAT OF CELLULAR PRODUCTIVITY
percapita_prod_plot_long <- 
  ggplot(dplyr::filter(metadata, year == "2015" & fraction %in% c("Free", "Particle")), 
       aes(x = lakesite, y = fracprod_per_cell_noinf, fill = fraction)) + 
  geom_bar(stat = "identity", color = "black",  position=position_dodge()) + 
  facet_grid(season ~ ., scales = "free_x") +
  scale_y_continuous(expand = c(0,0)) +
  scale_fill_manual(values = fraction_colors) + 
  guides(fill=guide_legend(nrow=1,byrow=TRUE)) +
  xlab("Lake Station") + ylab("Cellular Production \n(ug C/cell/Hr)") + 
  theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1),
        legend.position = "bottom", legend.title = element_blank(),
        plot.margin = unit(c(0.25, 0.25, 0.25, 0.5), "cm")) #top, right, bottom, and left margins)

plot_grid(community_prod_plot_long, percapita_prod_plot_long, 
          labels = c("A", "B"),
          nrow = 1, ncol = 2,
          align = "v")
## Warning: Removed 1 rows containing missing values (geom_bar).

plot_grid(prod1, prod2, labels = c("A", "B"),
          rel_heights = c(1, 1.3),
          nrow = 2, ncol = 1,
          align = "v")
## Warning: Removed 1 rows containing missing values (geom_bar).

### BY STATION 
prod_by_station <- ggplot(dplyr::filter(metadata, year == "2015" & fraction %in% c("Free")), 
       aes(x = lakesite, y = tot_bacprod)) + 
  geom_boxplot(fill = "grey") + geom_point(size = 3.5, position = position_jitter()) +
  scale_y_continuous(expand = c(0,0), limits = c(0,100)) +
  scale_fill_manual(values = fraction_colors) + 
  guides(fill=guide_legend(nrow=1,byrow=TRUE)) +
  xlab("Lake Station") + ylab("Total Production (ug C/cell/Hr)") + 
  theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1),
        legend.position = c(0.85, 0.9), legend.title = element_blank(),
        plot.margin = unit(c(0.25, 0.25, 0.25, 0.5), "cm")) #top, right, bottom, and left margins)

prod_by_season <- ggplot(dplyr::filter(metadata, year == "2015" & fraction %in% c("Free")), 
       aes(x = season, y = tot_bacprod)) + 
  geom_boxplot(fill = "grey") + geom_point(size = 3.5, position = position_jitter()) +
  scale_y_continuous(expand = c(0,0), limits = c(0,100)) +
  scale_fill_manual(values = fraction_colors) + 
  guides(fill=guide_legend(nrow=1,byrow=TRUE)) +
  xlab("Season") + 
  theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1),
        axis.title.y = element_blank(),
        legend.position = c(0.85, 0.9), legend.title = element_blank(),
        plot.margin = unit(c(0.25, 0.25, 0.25, 0.5), "cm")) #top, right, bottom, and left margins)

plot_grid(prod_by_station, prod_by_season, labels = c("A", "B"),
          rel_widths = c(1, 0.75),
          nrow = 1, ncol = 2,
          align = "h")

scaled_surface_PAFLA_pruned_rm2 <- scale_reads(surface_PAFL_otu_pruned_notree_rm2)

set.seed(777)

# Calculate the SOREN distance
soren_whole_dist <- ordinate(
  physeq = scaled_surface_PAFLA_pruned_rm2,
  method = "PCoA",
  distance = "bray",
  binary = TRUE) # Make it presence/absence
 
p1 <- plot_ordination(
  physeq = scaled_surface_PAFLA_pruned_rm2,
  ordination = soren_whole_dist,
  color = "fraction",
  shape = "lakesite",
  title = "Sørensen") +
  geom_point(size=5, alpha = 0.7, aes(fill =fraction,  color = fraction)) +
  scale_colour_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = c(21, 22, 23, 24)) +
  theme(legend.position = "none")


# Calculate the BRAY-CURTIS distance
bray_whole_dist <- ordinate(
  physeq = scaled_surface_PAFLA_pruned_rm2,
  method = "PCoA",
  distance = "bray",
  binary = FALSE) # Make it Abundance weighted 
 
p2 <- plot_ordination(
  physeq = scaled_surface_PAFLA_pruned_rm2,
  ordination = bray_whole_dist ,
  color = "fraction",
  shape = "lakesite",
  title = "Bray-Curtis") +
  geom_point(size=5, alpha = 0.7, aes(fill =fraction,  color = fraction)) +
  scale_colour_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = c(21, 22, 23, 24)) +
  theme(legend.position = "right", legend.title = element_blank())

plot_grid(p1, p2, labels = c("A", "B"), align = "h", ncol = 2,
          rel_widths = c(0.8, 1.2))

# Calculate bray curtis distance matrix
musk_bray <- phyloseq::distance(scaled_surface_PAFLA_pruned_rm2, method = "bray")

# make a data frame from the sample_data
sampledf <- data.frame(sample_data(scaled_surface_PAFLA_pruned_rm2))

# Adonis test
adonis(musk_bray ~ fraction + season , data = sampledf)
## 
## Call:
## adonis(formula = musk_bray ~ fraction + season, data = sampledf) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## fraction   1    1.3903 1.39032 16.2279 0.30094  0.001 ***
## season     2    1.5161 0.75804  8.8479 0.32816  0.001 ***
## Residuals 20    1.7135 0.08567         0.37089           
## Total     23    4.6199                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Environmental Variables Pairs

# Subset the environmental data only
environmental_data <- metadata %>%
  dplyr::select(Temp_C:DO_percent, -BGA_cellspermL, -SRP_ugperL, fraction, norep_filter_name) %>%
  dplyr::filter(fraction == "Free") %>%
  dplyr::select(-fraction) %>%
  tibble::column_to_rownames(var = "norep_filter_name") %>%
  as.matrix()

pairs(environmental_data)

Euclidian PCA with Environmental Variables

# Scale the data so their variances are the same!
scaled_enviro <- scale(environmental_data)

# Sanity Checks: check that we get mean of 0 and sd of 1
apply(scaled_enviro, 2, mean)
##         Temp_C SpCond_uSpercm     TDS_mgperL             pH         ORP_mV Chl_Lab_ugperL      Cl_mgperL     SO4_mgperL     NO3_mgperL     NH3_mgperL     TKN_mgperL      TP_ugperL     Alk_mgperL      DO_mgperL     DO_percent 
##  -3.151262e-16  -1.170392e-15  -8.881784e-16  -2.063155e-15   4.614635e-18   2.081668e-17  -2.937375e-16   3.932311e-16   1.237526e-16  -1.155579e-17   7.400583e-17  -5.090894e-17   1.454855e-15   7.401939e-17  -1.331111e-15
apply(scaled_enviro, 2, sd)
##         Temp_C SpCond_uSpercm     TDS_mgperL             pH         ORP_mV Chl_Lab_ugperL      Cl_mgperL     SO4_mgperL     NO3_mgperL     NH3_mgperL     TKN_mgperL      TP_ugperL     Alk_mgperL      DO_mgperL     DO_percent 
##              1              1              1              1              1              1              1              1              1              1              1              1              1              1              1
# Run a principal component analysis via the vegan package
pca_environ <- rda(scaled_enviro) # 

par(mar = c(5,5,2,5))
plot(summary(pca_environ)$cont$importance[2,]*100, 
     xlab = "PCA Axis", 
     ylab = "Variation Explained Per Axis",
     ylim = c(0, 105),
     col = "cornflowerblue",
     cex =2,
     pch = 16)
par(new = T)
plot(summary(pca_environ)$cont$importance[3,]*100,
       cex = 2,
       pch = 17,
       ylim = c(0, 105),
       col = "firebrick3",
       axes=F, 
       xlab=NA, 
       ylab=NA)
axis(side = 4)
mtext(side = 4, line = 3, "Total Accumulated Variation")
legend("right",
       legend=c("Per Axis", "Total"),
       pch=c(16, 17), col=c("cornflowerblue", "firebrick3"))

Linear Models with Environmental Variables

pca_scores_df <- summary(pca_environ)$sites %>%
  as.data.frame() %>%
  tibble::rownames_to_column(var = "norep_filter_name") %>%
  mutate(norep_water_name = paste(substr(norep_filter_name, 1, 4), substr(norep_filter_name, 6, 8), sep = "")) %>%
  dplyr::select(-norep_filter_name)

metadata_pca <- metadata %>%
  mutate(norep_water_name = paste(substr(norep_filter_name, 1, 4), substr(norep_filter_name, 6, 8), sep = "")) %>%
  left_join(pca_scores_df, by = "norep_water_name")



#####  SINGLE REGRESSION with PC1
## Free living
comm_lm_PC1_free <- summary(lm(frac_bacprod ~ PC1, 
           data = filter(metadata_pca, fraction == "Free")))
## Particle-associated
comm_lm_PC1_part <- summary(lm(frac_bacprod ~ PC1, 
           data = filter(metadata_pca, fraction == "Particle")))
## PER CAPITA: Free living
percap_lm_PC1_free <- summary(lm(log10(fracprod_per_cell_noinf) ~ PC1, 
           data = filter(metadata_pca, fraction == "Free")))
## PER CAPITA: Particle-associated
percap_lm_PC1_part <- summary(lm(log10(fracprod_per_cell_noinf) ~ PC1, 
           data = filter(metadata_pca, fraction == "Particle")))

# Put the results all together 
pca_lm_row1 <- c("Free", "Per-Liter","PC1",
          round(comm_lm_PC1_free$adj.r.squared, digits = 3),
          round(comm_lm_PC1_free$coefficients[2,4], digits = 3))
pca_lm_row2 <- c("Particle", "Per-Liter","PC1",
          round(comm_lm_PC1_part$adj.r.squared, digits = 3),
          round(comm_lm_PC1_part$coefficients[2,4], digits = 3))
pca_lm_row3 <- c("Free", "Per-Capita","PC1",
          round(percap_lm_PC1_free$adj.r.squared, digits = 3),
          round(percap_lm_PC1_free$coefficients[2,4], digits = 3))
pca_lm_row4 <- c("Particle", "Per-Capita", "PC1",
          round(percap_lm_PC1_part$adj.r.squared, digits = 3),
          round(percap_lm_PC1_part$coefficients[2,4], digits = 3))

pca_lm_results_df <- 
  rbind(pca_lm_row1, pca_lm_row2, pca_lm_row3, pca_lm_row4)

colnames(pca_lm_results_df) <- c("fraction", "Predictor","Prod_measure", "Adjusted_R2", "p-value")

row.names(pca_lm_results_df) = NULL

pander(pca_lm_results_df,
               caption = "Single Linear regression statistics for PC1.")
Single Linear regression statistics for PC1.
fraction Predictor Prod_measure Adjusted_R2 p-value
Free Per-Liter PC1 0.06 0.221
Particle Per-Liter PC1 -0.075 0.641
Free Per-Capita PC1 0.014 0.307
Particle Per-Capita PC1 0.006 0.33
#####  SINGLE REGRESSION with PC2
## Free living
comm_lm_PC2_free <- summary(lm(frac_bacprod ~ PC2, 
           data = filter(metadata_pca, fraction == "Free")))
## Particle-associated
comm_lm_PC2_part <- summary(lm(frac_bacprod ~ PC2, 
           data = filter(metadata_pca, fraction == "Particle")))
## PER CAPITA: Free living
percap_lm_PC2_free <- summary(lm(log10(fracprod_per_cell_noinf) ~ PC2, 
           data = filter(metadata_pca, fraction == "Free")))
## PER CAPITA: Particle-associated
percap_lm_PC2_part <- summary(lm(log10(fracprod_per_cell_noinf) ~ PC2, 
           data = filter(metadata_pca, fraction == "Particle")))

# Put the results all together 
pca_lm2_row1 <- c("Free", "Per-Liter","PC2",
          round(comm_lm_PC2_free$adj.r.squared, digits = 3),
          round(comm_lm_PC2_free$coefficients[2,4], digits = 3))
pca_lm2_row2 <- c("Particle", "Per-Liter","PC2",
          round(comm_lm_PC2_part$adj.r.squared, digits = 3),
          round(comm_lm_PC2_part$coefficients[2,4], digits = 3))
pca_lm2_row3 <- c("Free", "Per-Capita","PC2",
          round(percap_lm_PC2_free$adj.r.squared, digits = 3),
          round(percap_lm_PC2_free$coefficients[2,4], digits = 3))
pca_lm2_row4 <- c("Particle", "Per-Capita", "PC2",
          round(percap_lm_PC2_part$adj.r.squared, digits = 3),
          round(percap_lm_PC2_part$coefficients[2,4], digits = 3))

pca_lm2_results_df <- 
  rbind(pca_lm2_row1, pca_lm2_row2, pca_lm2_row3, pca_lm2_row4)

colnames(pca_lm2_results_df) <- c("fraction", "Predictor","Prod_measure", "Adjusted_R2", "p-value")

row.names(pca_lm2_results_df) = NULL

pander(pca_lm2_results_df,
               caption = "Single Linear regression statistics for PC2.")
Single Linear regression statistics for PC2.
fraction Predictor Prod_measure Adjusted_R2 p-value
Free Per-Liter PC2 -0.056 0.531
Particle Per-Liter PC2 0.207 0.077
Free Per-Capita PC2 0.033 0.268
Particle Per-Capita PC2 0.393 0.023
#####  MULTIPLE REGRESSION 
## Free living
summary(lm(frac_bacprod ~ PC1 + PC2, 
           data = filter(metadata_pca, fraction == "Free")))
## 
## Call:
## lm(formula = frac_bacprod ~ PC1 + PC2, data = filter(metadata_pca, 
##     fraction == "Free"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.792 -10.435  -7.571  12.769  30.622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   24.058      5.049   4.765  0.00102 **
## PC1            6.185      4.880   1.268  0.23676   
## PC2           -3.262      4.880  -0.669  0.52058   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.49 on 9 degrees of freedom
## Multiple R-squared:  0.1858, Adjusted R-squared:  0.004856 
## F-statistic: 1.027 on 2 and 9 DF,  p-value: 0.3966
## Particle-associated
summary(lm(frac_bacprod ~ PC1 + PC2, 
           data = filter(metadata_pca, fraction == "Particle")))
## 
## Call:
## lm(formula = frac_bacprod ~ PC1 + PC2, data = filter(metadata_pca, 
##     fraction == "Particle"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.0696  -4.7286  -0.1293   3.1707  15.5457 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    9.958      2.198   4.531  0.00142 **
## PC1            1.147      2.124   0.540  0.60246   
## PC2           -4.032      2.124  -1.898  0.09014 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.613 on 9 degrees of freedom
## Multiple R-squared:  0.302,  Adjusted R-squared:  0.1469 
## F-statistic: 1.947 on 2 and 9 DF,  p-value: 0.1983
## PER CAPITA: Free living
summary(lm(log10(fracprod_per_cell_noinf) ~ PC1 + PC2, 
           data = filter(metadata_pca, fraction == "Free")))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ PC1 + PC2, data = filter(metadata_pca, 
##     fraction == "Free"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5522 -0.2055 -0.1250  0.2276  0.5222 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -7.5750     0.1109 -68.318 1.56e-13 ***
## PC1           0.1176     0.1072   1.097    0.301    
## PC2          -0.1269     0.1072  -1.184    0.267    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3841 on 9 degrees of freedom
## Multiple R-squared:  0.2245, Adjusted R-squared:  0.05219 
## F-statistic: 1.303 on 2 and 9 DF,  p-value: 0.3185
## PER CAPITA: Particle-associated
summary(lm(log10(fracprod_per_cell_noinf) ~ PC1 + PC2, 
           data = filter(metadata_pca, fraction == "Particle")))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ PC1 + PC2, data = filter(metadata_pca, 
##     fraction == "Particle"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52230 -0.13916 -0.02677  0.17348  0.63461 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.6647     0.1112 -59.946 6.66e-12 ***
## PC1           0.2069     0.1081   1.913   0.0921 .  
## PC2          -0.3653     0.1096  -3.332   0.0103 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3643 on 8 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6253, Adjusted R-squared:  0.5317 
## F-statistic: 6.676 on 2 and 8 DF,  p-value: 0.01971
PC1_lm_plot <- 
  ggplot(metadata_pca, aes(x = PC1, y = log10(fracprod_per_cell_noinf), 
                         color = fraction,  fill = fraction)) +
  geom_point() + ylab("log10(cellular production)") + 
  scale_color_manual(values = fraction_colors) + 
  scale_fill_manual(values = fraction_colors) + 
  geom_smooth(method = "lm") +
  theme(legend.title = element_blank(), legend.position = "bottom")


PC2_lm_plot <- 
  ggplot(metadata_pca, aes(x = PC2, y = log10(fracprod_per_cell_noinf), 
                         color = fraction,  fill = fraction)) +
  geom_point() + ylab("log10(cellular production)") + 
  scale_color_manual(values = fraction_colors) + 
  scale_fill_manual(values = fraction_colors) + 
  geom_smooth(method = "lm") +
  theme(legend.title = element_blank(), legend.position = "bottom")  

plot_grid(PC2_lm_plot, PC2_lm_plot, 
          labels = c("A", "B"),
          nrow = 1, ncol = 2)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

par(oma(0.1, 0.1, 0.1, 0.1))
## Error in oma(0.1, 0.1, 0.1, 0.1): could not find function "oma"
biplot(pca_environ,
     xlab = paste("PC1", paste(round(summary(pca_environ)$cont$importance[2,1]*100, digits = 2), "%", sep = ""), sep = ": "),
     ylab = paste("PC2", paste(round(summary(pca_environ)$cont$importance[2,2]*100, digits = 2), "%", sep = ""), sep = ": "))

Figure 1

######################################################### Fraction ABUNDANCe 
frac_abund_wilcox <- wilcox.test(log10(as.numeric(fraction_bac_abund)) ~ fraction, data = metadata)
frac_abund_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  log10(as.numeric(fraction_bac_abund)) by fraction
## W = 0, p-value = 1.479e-06
## alternative hypothesis: true location shift is not equal to 0
metadata %>%
  group_by(fraction) %>%
  summarize(mean(as.numeric(fraction_bac_abund)))
## # A tibble: 2 x 2
##   fraction `mean(as.numeric(fraction_bac_abund))`
##     <fctr>                                  <dbl>
## 1 Particle                               41168.88
## 2     Free                              734522.25
# Make a data frame to draw significance line in boxplot (visually calculated)
dat1 <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(6.45,6.5,6.5,6.45)) # WholePart vs WholeFree

poster_a <- ggplot(filter(metadata, norep_filter_name != "MOTEJ515"), 
       aes(y = log10(as.numeric(fraction_bac_abund)), x = fraction)) +
  geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
  ylab("Log10(Bacterial Counts) \n (cells/mL)") +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  ##### Particle vs free cell abundances 
  geom_path(data = dat1, aes(x = a, y = b), linetype = 1, color = "gray40") +
  annotate("text", x=1.5, y=6.5, fontface = "bold",  size = 3.5, color = "gray40",
           label= paste("***\np =", round(frac_abund_wilcox$p.value, digits = 6))) +
  theme(legend.position = "bottom",
        legend.title = element_blank(),
        axis.title.x = element_blank())



######################################################### TOTAL PRODUCTION 
totprod_wilcox <- wilcox.test(frac_bacprod ~ fraction, data = metadata)
totprod_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  frac_bacprod by fraction
## W = 33, p-value = 0.02418
## alternative hypothesis: true location shift is not equal to 0
metadata %>%
  group_by(fraction) %>%
  summarize(mean(frac_bacprod))
## # A tibble: 2 x 2
##   fraction `mean(frac_bacprod)`
##     <fctr>                <dbl>
## 1 Particle             9.958333
## 2     Free            24.058333
# Make a data frame to draw significance line in boxplot (visually calculated)
dat2 <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(67,68,68,67)) # WholePart vs WholeFree


poster_b <- ggplot(metadata, aes(y = frac_bacprod, x = fraction)) + 
  geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
  scale_fill_manual(values = fraction_colors, guide = FALSE) +
  scale_shape_manual(values = season_shapes) +
  ylab("Community Production \n (μgC/L/hr)") +
  ##### Particle vs free bulk production  
  geom_path(data = dat2, aes(x = a, y = b), linetype = 1, color = "gray40") +
  annotate("text", x=1.5, y=68, fontface = "bold",  size = 3.5, color = "gray40",
           label= paste("**\np =", round(totprod_wilcox$p.value, digits = 3))) +
  theme(legend.position = "none",
        axis.title.x = element_blank())



######################################################### TOTAL PRODUCTION 
percellprod_wilcox <- wilcox.test(log10(fracprod_per_cell) ~ fraction, data = filter(metadata, norep_filter_name != "MOTEJ515"))
percellprod_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  log10(fracprod_per_cell) by fraction
## W = 125, p-value = 6.656e-05
## alternative hypothesis: true location shift is not equal to 0
filter(metadata, norep_filter_name != "MOTEJ515") %>%
  group_by(fraction) %>%
  summarize(mean(fracprod_per_cell))
## # A tibble: 2 x 2
##   fraction `mean(fracprod_per_cell)`
##     <fctr>                     <dbl>
## 1 Particle              4.816116e-07
## 2     Free              3.866798e-08
# Make a data frame to draw significance line in boxplot (visually calculated)
dat3 <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(-5.05,-5,-5,-5.05)) # WholePart vs WholeFree


poster_c <- ggplot(filter(metadata, norep_filter_name != "MOTEJ515"), 
       aes(y = log10(fracprod_per_cell), x = fraction)) +
  geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
  scale_fill_manual(values = fraction_colors, guide = FALSE) +
  scale_shape_manual(values = season_shapes) +
  ylim(c(-8.5, -5)) + 
  ylab("Log10(Cellular Production) \n(μgC/cell/hr)") +
  ##### Particle vs free per-cell production 
  geom_path(data = dat3, aes(x = a, y = b), linetype = 1, color = "gray40") +
  annotate("text", x=1.5, y=-5, fontface = "bold",  size = 3.5, color = "gray40",
           label= paste("***\np =", round(percellprod_wilcox$p.value, digits = 5))) +
  theme(legend.position = "none",
        axis.title.x = element_blank())



######## FIGURE 1
# legend
legend <- get_legend(poster_a)
## Error in f(...): object 'season_shapes' not found
row1_plots <- plot_grid(poster_a + theme(legend.position = "none"), poster_b, poster_c,
          labels = c("A", "B", "C"),
          ncol = 3, nrow = 1)
## Error in f(...): object 'season_shapes' not found
#fig_1 <- 
  plot_grid(row1_plots, legend,
                   ncol = 1, nrow = 2, 
                   rel_heights = c(1, 0.05))
## Error in plot_grid(row1_plots, legend, ncol = 1, nrow = 2, rel_heights = c(1, : object 'row1_plots' not found

Figure S1

work_df <- metadata %>%
  dplyr::select(norep_filter_name, fraction, fraction_bac_abund, frac_bacprod, fracprod_per_cell_noinf) %>%
  mutate(norep_water_name = paste(substr(norep_filter_name, 1,4), substr(norep_filter_name, 6,9), sep = "")) %>%
  dplyr::select(-norep_filter_name)

part_work_df <- work_df %>%
  filter(fraction == "Particle") %>%
  rename(part_bacabund = fraction_bac_abund,
         part_prod = frac_bacprod, 
         part_percell_prod = fracprod_per_cell_noinf) %>%
  dplyr::select(-fraction)

free_work_df <- work_df %>%
  filter(fraction == "Free") %>%
  rename(free_bacabund = fraction_bac_abund,
         free_prod = frac_bacprod, 
         free_percell_prod = fracprod_per_cell_noinf) %>%
  dplyr::select(-fraction)

byfrac_df <- part_work_df %>%
  left_join(free_work_df, by = "norep_water_name")

byfrac_df$season <- substr(byfrac_df$norep_water_name, 5,5) # 7th letter = month sampled
byfrac_df$season <- as.character(byfrac_df$season)
byfrac_df$season <- ifelse(byfrac_df$season == "5", "Spring", 
                             ifelse(byfrac_df$season == "7", "Summer", 
                                    ifelse(byfrac_df$season == "9", "Fall",
                                           "NA")))
byfrac_df$season <- factor(byfrac_df$season, levels = c("Spring", "Summer", "Fall"))

summary(lm(log10(part_bacabund) ~ log10(free_bacabund), data = filter(byfrac_df, norep_water_name != "MOTE515")))
## 
## Call:
## lm(formula = log10(part_bacabund) ~ log10(free_bacabund), data = filter(byfrac_df, 
##     norep_water_name != "MOTE515"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52690 -0.08048  0.05903  0.19565  0.35255 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)            2.0900     3.2247   0.648    0.533
## log10(free_bacabund)   0.4264     0.5532   0.771    0.461
## 
## Residual standard error: 0.3111 on 9 degrees of freedom
## Multiple R-squared:  0.06194,    Adjusted R-squared:  -0.04229 
## F-statistic: 0.5943 on 1 and 9 DF,  p-value: 0.4605
plot1 <- ggplot(filter(byfrac_df, norep_water_name != "MOTE515"), 
       aes(x = log10(free_bacabund), y = log10(part_bacabund))) +
  xlab("Free") +  ylab("Particle") + 
  ggtitle("Log10(Bacterial Counts) \n (cells/mL)") +
  geom_point(size = 3, fill = "grey", aes(shape = season), width = 0.2) + 
  scale_shape_manual(values = season_shapes) +
  theme(legend.position = "bottom",
        legend.title = element_blank())
## Warning: Ignoring unknown parameters: width
lm_prod_corr <- lm(part_prod ~ free_prod, data = byfrac_df)

plot2 <- ggplot(byfrac_df, aes(x = free_prod, y = part_prod)) +
  xlab("Free") +  ylab("Particle") + 
  ggtitle("Community Production \n(μgC/L/hr)") +
  geom_point(size = 3, fill = "grey", aes(shape = season), width = 0.2) + 
  scale_shape_manual(values = season_shapes) +
  geom_smooth(method = "lm", color = "black")  + 
  annotate("text", x =20, y= 30, color = "black", fontface = "bold", size = 4,
       label = paste("R2 =", round(summary(lm_prod_corr)$adj.r.squared, digits = 3), "\n", 
                     "p =", round(unname(summary(lm_prod_corr)$coefficients[,4][2]), digits = 3))) +
  theme(legend.position = "none")
## Warning: Ignoring unknown parameters: width
lm_percell_corr <- lm(log10(part_percell_prod) ~ log10(free_percell_prod), data = byfrac_df)

plot3 <- ggplot(byfrac_df,
       aes(x = log10(free_percell_prod), y = log10(part_percell_prod))) +  
  xlab("Free") +  ylab("Particle") + 
  ggtitle("Log10(Cellular Production) \n (μgC/cell/hr)") +
  geom_point(size = 3, fill = "grey", aes(shape = season), width = 0.2) + 
  scale_shape_manual(values = season_shapes) +
  geom_smooth(method = "lm", color = "black") + 
  annotate("text", y = -5.8, x=-7.9, color = "black", fontface = "bold", size = 4,
       label = paste("R2 =", round(summary(lm_percell_corr)$adj.r.squared, digits = 3), "\n", 
                     "p =", round(unname(summary(lm_percell_corr)$coefficients[,4][2]), digits = 3))) +
  theme(legend.position = "none")
## Warning: Ignoring unknown parameters: width
legend_s1 <- get_legend(plot1)
## Error in f(...): object 'season_shapes' not found
top_row_S1 <- plot_grid(plot1 +theme(legend.position = "none"), plot2, plot3, 
          nrow = 1, ncol = 3, 
          labels = c("A", "B", "C"), 
          align = "h")
## Warning in f(...): restarting interrupted promise evaluation
## Warning in f(...): restarting interrupted promise evaluation
## Error in f(...): object 'season_shapes' not found
plot_grid(top_row_S1, legend_s1,
          rel_heights = c(1, 0.05),
          nrow = 2, ncol = 1)
## Error in plot_grid(top_row_S1, legend_s1, rel_heights = c(1, 0.05), nrow = 2, : object 'top_row_S1' not found

Calculate Community Diversity

# Set the seed for reproducibility
set.seed(777)

# Calculate the alpha diversity with 100 repsampling events
alpha_calc <- calc_alpha_diversity(physeq = surface_PAFL_otu_pruned_notree_rm2)

# What was the minimum sample size? 
min(sample_sums(surface_PAFL_otu_pruned_notree_rm2)) - 1
## [1] 6489
# Put it altogether in a dataframe 
otu_alphadiv <- calc_mean_alphadiv(physeq = surface_PAFL_otu_pruned_notree_rm2,
                           richness_df = alpha_calc$Richness, 
                           evenness_df = alpha_calc$Inverse_Simpson, 
                           shannon_df = alpha_calc$Shannon) %>%
  mutate(fraction = factor(fraction, levels = c("Particle","Free")),
         lakesite = factor(lakesite,  levels = c("Outlet", "Deep", "Bear", "River")),
         measure = factor(measure, levels = c("Richness",  "Shannon_Entropy", "Inverse_Simpson", "Simpsons_Evenness")),
         norep_water_name = paste(substr(norep_filter_name, 1, 4), substr(norep_filter_name, 6, 8), sep = ""))

otu_alphadiv <- otu_alphadiv %>%
  left_join(pca_scores_df, by = "norep_water_name")

Correlations between Diversity Measures

##########################################################################
###########################   CORRELATIONS   #############################
##########################################################################
# RICHNESS vs SHANNON
cor(filter(otu_alphadiv, measure == "Richness" & fraction == "Particle")$mean,
    filter(otu_alphadiv, measure == "Shannon_Entropy" & fraction == "Particle")$mean) # YES
## [1] 0.9406491
# SHANNON VS INVERSE SIMPSON
cor(filter(otu_alphadiv, measure == "Shannon_Entropy" & fraction == "Particle")$mean,
    filter(otu_alphadiv, measure == "Inverse_Simpson" &  fraction == "Particle")$mean) # YES
## [1] 0.9651242
# INVERSE SIMPSON VS SIMPSONS EVENNESS
cor(filter(otu_alphadiv, measure == "Inverse_Simpson" & fraction == "Particle")$mean,
    filter(otu_alphadiv, measure == "Simpsons_Evenness" & fraction == "Particle")$mean) # YES
## [1] 0.9145095
# SIMPSONS EVENNESS VS RICHNESS
cor(filter(otu_alphadiv, measure == "Simpsons_Evenness" & fraction == "Particle")$mean,
    filter(otu_alphadiv, measure == "Richness" &  fraction == "Particle")$mean) # YES
## [1] 0.6881205
ggplot(otu_alphadiv, aes(y = mean, x = fraction)) +
  ylab("Mean Diversity Value") +
  facet_wrap(~measure, scale = "free_y", ncol = 4) +
  geom_boxplot(alpha = 0.5, outlier.shape = NA, color = "black", aes(fill = fraction)) +
  geom_jitter(size = 3, aes(fill = fraction, shape = season), color = "black") +
  scale_fill_manual(values = fraction_colors) + 
  scale_shape_manual(values = season_shapes) + 
  scale_color_manual(values = fraction_colors) + 
  theme(legend.position = "none", axis.title.x = element_blank()) 

Linear Regression Statistics

#################################### Subset Dataframes 
richness <- filter(otu_alphadiv, measure == "Richness")
shannon <- filter(otu_alphadiv, measure == "Shannon_Entropy")
invsimps <- filter(otu_alphadiv, measure == "Inverse_Simpson")
simpseven <- filter(otu_alphadiv, measure == "Simpsons_Evenness")

Linear Regressions with Fraction Diversity

#################################### Bulk Measure Production #################################### 
################### Richness ################### 
summary(lm(frac_bacprod ~ mean, data = richness))  # All samples together
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = richness)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.057 -12.017  -5.654   9.256  46.605 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 18.501168   9.039579   2.047   0.0528 .
## mean        -0.003335   0.018911  -0.176   0.8616  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.54 on 22 degrees of freedom
## Multiple R-squared:  0.001412,   Adjusted R-squared:  -0.04398 
## F-statistic: 0.0311 on 1 and 22 DF,  p-value: 0.8616
# Particle-Associated 
lm_prod_vs_rich_PA <- lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Particle"))
summary(lm_prod_vs_rich_PA)
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(richness, fraction == 
##     "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.1271 -2.8865 -0.4649  3.7537  9.8401 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -11.269531   5.717450  -1.971  0.07701 . 
## mean          0.038125   0.009868   3.863  0.00314 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.476 on 10 degrees of freedom
## Multiple R-squared:  0.5988, Adjusted R-squared:  0.5587 
## F-statistic: 14.93 on 1 and 10 DF,  p-value: 0.003143
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_prod_vs_rich_PA <- train(
      frac_bacprod ~ mean, data = filter(richness, fraction == "Particle"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))

cv_lm_prod_vs_rich_PA$results   # Particle-Associated CV results 
##   intercept     RMSE  Rsquared   RMSESD RsquaredSD
## 1      TRUE 6.722233 0.6674307 2.204547  0.2884269
summary(lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Free"))) # Free Living 
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(richness, fraction == 
##     "Free"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.094 -11.112  -2.755   8.611  33.308 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  5.40793   21.62006   0.250    0.808
## mean         0.05511    0.06207   0.888    0.395
## 
## Residual standard error: 17.7 on 10 degrees of freedom
## Multiple R-squared:  0.07306,    Adjusted R-squared:  -0.01963 
## F-statistic: 0.7882 on 1 and 10 DF,  p-value: 0.3955
################### Shannon Entropy ################### 
summary(lm(frac_bacprod ~ mean, data = shannon))  # All samples together
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = shannon)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.659 -11.878  -5.666   9.231  46.593 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.63373   26.71652   0.623    0.540
## mean         0.08797    6.22969   0.014    0.989
## 
## Residual standard error: 15.55 on 22 degrees of freedom
## Multiple R-squared:  9.064e-06,  Adjusted R-squared:  -0.04545 
## F-statistic: 0.0001994 on 1 and 22 DF,  p-value: 0.9889
# Particle-Associated 
lm_prod_vs_shannon_PA <- lm(frac_bacprod ~ mean, data = filter(shannon, fraction == "Particle"))
summary(lm_prod_vs_shannon_PA)
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(shannon, fraction == 
##     "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.9022 -2.9150 -0.5875  1.6713 12.0544 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -40.320     14.007  -2.878  0.01643 * 
## mean          11.089      3.068   3.614  0.00473 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.693 on 10 degrees of freedom
## Multiple R-squared:  0.5664, Adjusted R-squared:  0.5231 
## F-statistic: 13.06 on 1 and 10 DF,  p-value: 0.004734
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_prod_vs_shannon_PA <- train(
      frac_bacprod ~ mean, data = filter(shannon, fraction == "Particle"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))

cv_lm_prod_vs_shannon_PA$results   # Particle-Associated CV results 
##   intercept     RMSE  Rsquared   RMSESD RsquaredSD
## 1      TRUE 6.743204 0.6698453 2.440644   0.283921
summary(lm(frac_bacprod ~ mean, data = filter(shannon, fraction == "Free"))) # Free Living 
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(shannon, fraction == 
##     "Free"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.004 -10.708  -3.738   6.632  37.129 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -13.37      73.87  -0.181    0.860
## mean            9.40      18.50   0.508    0.622
## 
## Residual standard error: 18.15 on 10 degrees of freedom
## Multiple R-squared:  0.02516,    Adjusted R-squared:  -0.07233 
## F-statistic: 0.2581 on 1 and 10 DF,  p-value: 0.6225
################### Inverse Simpson ################### 
summary(lm(frac_bacprod ~ mean, data = invsimps))  # All samples together
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = invsimps)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.269 -10.298  -4.916   5.866  46.452 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  12.1577     6.0225   2.019   0.0559 .
## mean          0.1629     0.1731   0.941   0.3570  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.25 on 22 degrees of freedom
## Multiple R-squared:  0.03868,    Adjusted R-squared:  -0.005019 
## F-statistic: 0.8851 on 1 and 22 DF,  p-value: 0.357
# Particle-Associated samples 
lm_prod_vs_invsimps_PA <- lm(frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle"))
summary(lm_prod_vs_invsimps_PA)
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(invsimps, fraction == 
##     "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5589 -2.1093 -0.1969  0.8752  7.8282 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.45199    2.43866  -0.185  0.85666    
## mean         0.29344    0.05781   5.076  0.00048 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.571 on 10 degrees of freedom
## Multiple R-squared:  0.7204, Adjusted R-squared:  0.6925 
## F-statistic: 25.77 on 1 and 10 DF,  p-value: 0.0004804
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_prod_vs_invsimps_PA <- train(
      frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))

cv_lm_prod_vs_invsimps_PA$results       # Cross Validated PA results
##   intercept     RMSE Rsquared   RMSESD RsquaredSD
## 1      TRUE 5.257223 0.755187 2.091455  0.2576904
#Free Living Samples 
summary(lm(frac_bacprod ~ mean, data = filter(invsimps, fraction == "Free")))
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(invsimps, fraction == 
##     "Free"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.765  -9.356  -4.445   6.116  36.017 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  11.0985    16.7052   0.664    0.521
## mean          0.5379     0.6598   0.815    0.434
## 
## Residual standard error: 17.8 on 10 degrees of freedom
## Multiple R-squared:  0.06233,    Adjusted R-squared:  -0.03143 
## F-statistic: 0.6648 on 1 and 10 DF,  p-value: 0.4339
################### Simpson's Evenness ################### 
summary(lm(frac_bacprod ~ mean, data = simpseven))  # All samples together
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = simpseven)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.000  -7.163  -3.815   3.392  45.845 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  -0.8524     9.2286  -0.092   0.9272  
## mean        274.1606   134.4253   2.040   0.0536 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.26 on 22 degrees of freedom
## Multiple R-squared:  0.159,  Adjusted R-squared:  0.1208 
## F-statistic:  4.16 on 1 and 22 DF,  p-value: 0.05359
# Particle-Associated 
lm_prod_vs_simpseven_PA <- lm(frac_bacprod ~ mean, data = filter(simpseven, fraction == "Particle"))
summary(lm_prod_vs_simpseven_PA)
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(simpseven, fraction == 
##     "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.973  -2.229  -1.086   1.356  12.380 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   -3.829      4.539  -0.844  0.41865   
## mean         234.057     71.238   3.286  0.00821 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.995 on 10 degrees of freedom
## Multiple R-squared:  0.5191, Adjusted R-squared:  0.471 
## F-statistic: 10.79 on 1 and 10 DF,  p-value: 0.008211
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_prod_vs_simpseven_PA <- train(
      frac_bacprod ~ mean, data = filter(simpseven, fraction == "Particle"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))

cv_lm_prod_vs_simpseven_PA$results   # Particle-Associated CV results 
##   intercept     RMSE  Rsquared   RMSESD RsquaredSD
## 1      TRUE 6.225165 0.6598027 3.078422  0.3241796
summary(lm(frac_bacprod ~ mean, data = filter(simpseven, fraction == "Free"))) # Free Living 
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(simpseven, fraction == 
##     "Free"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.389 -12.029  -3.306   4.668  39.946 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    15.85      23.52   0.674    0.516
## mean          114.96     321.02   0.358    0.728
## 
## Residual standard error: 18.27 on 10 degrees of freedom
## Multiple R-squared:  0.01266,    Adjusted R-squared:  -0.08607 
## F-statistic: 0.1282 on 1 and 10 DF,  p-value: 0.7277
#################################### Per-Cell Production #################################### 
################### Richness ################### 
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = richness))  # All samples together 
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = richness)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8668 -0.2124  0.1045  0.2133  0.6473 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.4591277  0.2212633 -38.231  < 2e-16 ***
## mean         0.0028988  0.0004641   6.246  3.4e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3804 on 21 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6501, Adjusted R-squared:  0.6334 
## F-statistic: 39.02 on 1 and 21 DF,  p-value: 3.395e-06
# Particle-Associated 
lm_percell_prod_vs_rich_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Particle"))
summary(lm_percell_prod_vs_rich_PA)
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, 
##     fraction == "Particle"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.55347 -0.21545 -0.01066  0.12536  0.58830 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.0617648  0.3717671 -21.685 4.44e-09 ***
## mean         0.0023794  0.0006348   3.748  0.00457 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3506 on 9 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6095, Adjusted R-squared:  0.5662 
## F-statistic: 14.05 on 1 and 9 DF,  p-value: 0.004567
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_percell_prod_vs_rich_PA <- train(
      log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Particle" & norep_filter_name != "MOTEJ515"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))

cv_lm_percell_prod_vs_rich_PA$results      # Particle-Associated CV results 
##   intercept      RMSE  Rsquared    RMSESD RsquaredSD
## 1      TRUE 0.4259432 0.6442049 0.1528492  0.3200838
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Free"))) # Free Living 
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, 
##     fraction == "Free"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.71719 -0.13833  0.07155  0.23581  0.56949 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.135758   0.471253 -17.264 8.99e-09 ***
## mean         0.001657   0.001353   1.225    0.249    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3859 on 10 degrees of freedom
## Multiple R-squared:  0.1304, Adjusted R-squared:  0.04344 
## F-statistic: 1.499 on 1 and 10 DF,  p-value: 0.2488
################### Shannon Entropy ################### 
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = shannon))  # All samples together 
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = shannon)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.03514 -0.15042 -0.03394  0.26568  0.82794 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -10.9036     0.7534 -14.472 2.14e-12 ***
## mean          0.8805     0.1763   4.993 6.09e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4348 on 21 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.5428, Adjusted R-squared:  0.521 
## F-statistic: 24.93 on 1 and 21 DF,  p-value: 6.09e-05
# Particle-Associated 
lm_percell_prod_vs_shannon_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon, fraction == "Particle"))
summary(lm_percell_prod_vs_shannon_PA)
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon, 
##     fraction == "Particle"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36686 -0.23571 -0.01330  0.03961  0.70312 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -9.8897     0.8820 -11.213 1.37e-06 ***
## mean          0.6993     0.1935   3.614  0.00562 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3584 on 9 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.5921, Adjusted R-squared:  0.5468 
## F-statistic: 13.06 on 1 and 9 DF,  p-value: 0.00562
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_percell_prod_vs_shannon_PA <- train(
      log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon, fraction == "Particle" & norep_filter_name != "MOTEJ515"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))

cv_lm_percell_prod_vs_shannon_PA$results      # Particle-Associated CV results 
##   intercept      RMSE  Rsquared    RMSESD RsquaredSD
## 1      TRUE 0.4165133 0.6923215 0.1364644   0.282022
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon, fraction == "Free"))) # Free Living 
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon, 
##     fraction == "Free"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.72444 -0.17785  0.08114  0.13946  0.67366 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8.8666     1.6332  -5.429 0.000289 ***
## mean          0.3243     0.4091   0.793 0.446298    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4014 on 10 degrees of freedom
## Multiple R-squared:  0.05914,    Adjusted R-squared:  -0.03495 
## F-statistic: 0.6285 on 1 and 10 DF,  p-value: 0.4463
################### Inverse Simpson ################### 
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = invsimps))  # All samples together 
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = invsimps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.97302 -0.28199 -0.05285  0.32003  0.99088 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.86039    0.18153 -43.301  < 2e-16 ***
## mean         0.02355    0.00525   4.485 0.000204 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4596 on 21 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.4893, Adjusted R-squared:  0.4649 
## F-statistic: 20.12 on 1 and 21 DF,  p-value: 0.0002037
# Particle-Associated 
lm_percell_prod_vs_invsimps_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle"))
summary(lm_percell_prod_vs_invsimps_PA)
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, 
##     fraction == "Particle"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28651 -0.18384 -0.11125  0.07337  0.56444 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.360961   0.159490 -46.153 5.27e-12 ***
## mean         0.018087   0.003759   4.812 0.000958 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2969 on 9 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7201, Adjusted R-squared:  0.689 
## F-statistic: 23.15 on 1 and 9 DF,  p-value: 0.0009581
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_percell_prod_vs_invsimps_PA <- train(
      log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle" & norep_filter_name != "MOTEJ515"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))

cv_lm_percell_prod_vs_invsimps_PA$results      # Particle-Associated CV results 
##   intercept      RMSE  Rsquared    RMSESD RsquaredSD
## 1      TRUE 0.3530999 0.7312218 0.1134866  0.3167412
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Free"))) # Free Living 
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, 
##     fraction == "Free"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.68269 -0.11512  0.01325  0.17742  0.63175 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.03513    0.35685 -22.517 6.72e-10 ***
## mean         0.01910    0.01409   1.355    0.205    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3803 on 10 degrees of freedom
## Multiple R-squared:  0.1551, Adjusted R-squared:  0.07064 
## F-statistic: 1.836 on 1 and 10 DF,  p-value: 0.2052
################### Simpson's Evenness ################### 
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = simpseven))  # All samples together 
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = simpseven)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.06874 -0.41920  0.04553  0.34511  1.56565 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -7.4496     0.4116  -18.10 2.74e-14 ***
## mean          4.3470     6.0344    0.72    0.479    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6353 on 21 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.02411,    Adjusted R-squared:  -0.02236 
## F-statistic: 0.5189 on 1 and 21 DF,  p-value: 0.4792
# Particle-Associated 
lm_percell_prod_vs_simpseven_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven, fraction == "Particle"))
summary(lm_percell_prod_vs_simpseven_PA)
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven, 
##     fraction == "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4726 -0.2230 -0.1106  0.1248  0.6887 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -7.5941     0.2885 -26.324 7.96e-10 ***
## mean         15.1887     4.6341   3.278  0.00957 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3788 on 9 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.4935 
## F-statistic: 10.74 on 1 and 9 DF,  p-value: 0.009566
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_percell_prod_vs_simpseven_PA <- train(
      log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven, fraction == "Particle" & norep_filter_name != "MOTEJ515"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))

cv_lm_percell_prod_vs_simpseven_PA$results      # Particle-Associated CV results 
##   intercept      RMSE  Rsquared    RMSESD RsquaredSD
## 1      TRUE 0.4449297 0.6843005 0.1423662  0.3106107
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven, fraction == "Free"))) # Free Living 
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven, 
##     fraction == "Free"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.63112 -0.24216  0.01388  0.14672  0.77426 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -7.9274     0.5202 -15.240    3e-08 ***
## mean          4.9361     7.1008   0.695    0.503    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4041 on 10 degrees of freedom
## Multiple R-squared:  0.04609,    Adjusted R-squared:  -0.0493 
## F-statistic: 0.4832 on 1 and 10 DF,  p-value: 0.5028

R2 table

########## PUT A TABLE TOGETHER 
# Per Liter Production 
perliter_row1 <- c("Richness", "Per-Liter",
          round(summary(lm_prod_vs_rich_PA)$adj.r.squared, digits = 3),
          round(cv_lm_prod_vs_rich_PA$results$Rsquared, digits = 3), 
          round(cv_lm_prod_vs_rich_PA$results$RsquaredSD, digits = 3))

perliter_row2 <- c("Shannon_Entropy", "Per-Liter",
          round(summary(lm_prod_vs_shannon_PA)$adj.r.squared, digits = 3),
          round(cv_lm_prod_vs_shannon_PA$results$Rsquared, digits = 3), 
          round(cv_lm_prod_vs_shannon_PA$results$RsquaredSD, digits = 3))

perliter_row3 <- c("Inverse_Simpson", "Per-Liter",
          round(summary(lm_prod_vs_invsimps_PA)$adj.r.squared, digits = 3),
          round(cv_lm_prod_vs_invsimps_PA$results$Rsquared, digits = 3), 
          round(cv_lm_prod_vs_invsimps_PA$results$RsquaredSD, digits = 3))

perliter_row4 <- c("Simpsons_Evenness","Per-Liter",
          round(summary(lm_prod_vs_simpseven_PA)$adj.r.squared, digits = 3),
          round(cv_lm_prod_vs_simpseven_PA$results$Rsquared, digits = 3), 
          round(cv_lm_prod_vs_simpseven_PA$results$RsquaredSD, digits = 3))



# Per cell production 
percell_row1 <- c("Richness", "Per-Cell",
          round(summary(lm_percell_prod_vs_rich_PA)$adj.r.squared, digits = 3),
          round(cv_lm_percell_prod_vs_rich_PA$results$Rsquared, digits = 3), 
          round(cv_lm_percell_prod_vs_rich_PA$results$RsquaredSD, digits = 3))

percell_row2 <- c("Shannon_Entropy", "Per-Cell",
          round(summary(lm_percell_prod_vs_shannon_PA)$adj.r.squared, digits = 3),
          round(cv_lm_percell_prod_vs_shannon_PA$results$Rsquared, digits = 3), 
          round(cv_lm_percell_prod_vs_shannon_PA$results$RsquaredSD, digits = 3))

percell_row3 <- c("Inverse_Simpson", "Per-Cell",
          round(summary(lm_percell_prod_vs_invsimps_PA)$adj.r.squared, digits = 3),
          round(cv_lm_percell_prod_vs_invsimps_PA$results$Rsquared, digits = 3), 
          round(cv_lm_percell_prod_vs_invsimps_PA$results$RsquaredSD, digits = 3))

percell_row4 <- c("Simpsons_Evenness", "Per-Cell",
          round(summary(lm_percell_prod_vs_simpseven_PA)$adj.r.squared, digits = 3),
          round(cv_lm_percell_prod_vs_simpseven_PA$results$Rsquared, digits = 3), 
          round(cv_lm_percell_prod_vs_simpseven_PA$results$RsquaredSD, digits = 3))

r2_table <- as.data.frame(rbind(perliter_row1, perliter_row2, perliter_row3, perliter_row4,
                                percell_row1, percell_row2, percell_row3, percell_row4))
colnames(r2_table) <- c("Diversity_Metric", "Production_Measure","Adjusted_R2","CV_R2", "CV_R2_SD")
row.names(r2_table) = NULL

pander(r2_table,
               caption = "Supplemental Table 2: \n R2 estimates for heterotrophic production vs particle-associated diversity linear regressions.")
Supplemental Table 2: R2 estimates for heterotrophic production vs particle-associated diversity linear regressions.
Diversity_Metric Production_Measure Adjusted_R2 CV_R2 CV_R2_SD
Richness Per-Liter 0.559 0.667 0.288
Shannon_Entropy Per-Liter 0.523 0.67 0.284
Inverse_Simpson Per-Liter 0.692 0.755 0.258
Simpsons_Evenness Per-Liter 0.471 0.66 0.324
Richness Per-Cell 0.566 0.644 0.32
Shannon_Entropy Per-Cell 0.547 0.692 0.282
Inverse_Simpson Per-Cell 0.689 0.731 0.317
Simpsons_Evenness Per-Cell 0.493 0.684 0.311

Prepare Figure 2

######################################################### OBSERVED RICHNESS 
rich_fraction_wilcox <- wilcox.test(mean ~ fraction, data = richness)
rich_fraction_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  mean by fraction
## W = 129, p-value = 0.0004955
## alternative hypothesis: true location shift is not equal to 0
filter(richness) %>%
  group_by(fraction) %>%
  summarize(mean(mean), sd(mean))
## # A tibble: 2 x 3
##   fraction `mean(mean)` `sd(mean)`
##     <fctr>        <dbl>      <dbl>
## 1 Particle     556.7992  167.31404
## 2     Free     338.4242   85.98665
# Make a data frame to draw significance line in boxplot (visually calculated)
rich_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(920, 930, 930, 920)) # WholePart vs WholeFree

rich_distribution_plot <- 
  ggplot(richness, aes(y = mean, x = fraction)) +
  scale_fill_manual(values = fraction_colors) +
  geom_jitter(size = 3, shape = 21, aes(fill = fraction), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(150,950), breaks = seq(from = 0, to =925, by = 150)) + 
  xlab("Observed Richness") + xlab("Fraction") + 
  geom_path(data = rich_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=790, fontface = "bold",  size = 4, color = "#424645",
           label= paste("***\np =", round(rich_fraction_wilcox$p.value, digits = 5))) +
  theme(legend.position = "none",# axis.title.y = element_blank(),
        axis.title.x = element_blank(), axis.text.x = element_blank()) +
  coord_flip()


# Richness vs Per-Liter Heterotrophic Production Plot 
prod_vs_rich_plot <-  
  ggplot(richness, aes(x=mean, y=frac_bacprod)) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) +  # Y-axis errorbars
  geom_point(size = 3.5, shape = 21, color = "black", aes(fill = fraction)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  ylab("Heterotrophic Production \n (μgC/L/hr)") + 
  xlab("Observed Richness") +
  geom_smooth(data=filter(richness, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(150,950), breaks = seq(from = 0, to =925, by = 150)) + 
  annotate("text", x = 790, y=45, color = "#FF6600", fontface = "bold", size = 4,
           label = paste("R2 =", round(summary(lm_prod_vs_rich_PA)$adj.r.squared, digits = 2), "\n", 
                         "p =", round(unname(summary(lm_prod_vs_rich_PA)$coefficients[,4][2]), digits = 3))) +
  theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank()) 


# Richness vs Per-cell herterotrophic production Plot 
percell_prod_vs_rich_plot <- 
  ggplot(richness, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, shape = 21, color = "black", aes(fill = fraction)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  ylab("log10(production/cell)\n (μgC/cell/hr)") +
  xlab("Observed Richness") +
  geom_smooth(data=filter(richness, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(150,950), breaks = seq(from = 0, to =925, by = 150)) + 
  #scale_y_continuous(limits = c(-8e-7,5e-6), breaks = seq(from = 0, to = 6e-7, by = 3e-7)) + 
  annotate("text", x = 790, y=-7.5, color = "#FF6600", fontface = "bold", size = 4,
           label = paste("R2 =", round(summary(lm_percell_prod_vs_rich_PA)$adj.r.squared, digits = 2), "\n", 
                         "p =", round(unname(summary(lm_percell_prod_vs_rich_PA)$coefficients[,4][2]), digits = 3))) + 
  #annotate("text", x = 250, y=55, color = "skyblue", fontface = "bold", label = paste("Free = NS")) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))


# All 3 richness plots together 
rich_plots <- plot_grid(rich_distribution_plot, prod_vs_rich_plot, percell_prod_vs_rich_plot,
          labels = c("A", "C", "E"), ncol = 1, nrow = 3,
          rel_heights = c(0.5, 0.8, 1.1),
          align = "v")
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font metrics unknown for Unicode character U+03bc
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/L/hr)' in 'mbcsToSbcs': dot substituted for <ce>
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/L/hr)' in 'mbcsToSbcs': dot substituted for <bc>
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbarh).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font metrics unknown for Unicode character U+03bc
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/cell/hr)' in 'mbcsToSbcs': dot substituted for <ce>
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/cell/hr)' in 'mbcsToSbcs': dot substituted for <bc>
######################################################### INVERSE SIMPSON
invsimps_fraction_wilcox <- wilcox.test(mean ~ fraction, data = invsimps)
invsimps_fraction_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  mean by fraction
## W = 81, p-value = 0.6297
## alternative hypothesis: true location shift is not equal to 0
filter(invsimps) %>%
  group_by(fraction) %>%
  summarize(mean(mean), sd(mean))
## # A tibble: 2 x 3
##   fraction `mean(mean)` `sd(mean)`
##     <fctr>        <dbl>      <dbl>
## 1 Particle     35.47659  23.843325
## 2     Free     24.09219   8.136901
# Make a data frame to draw significance line in boxplot (visually calculated)
invsimps_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(83,85,85,83)) # WholePart vs WholeFree

invsimps_distribution_plot <- 
  ggplot(invsimps, aes(y = mean, x = fraction)) +
  scale_fill_manual(values = fraction_colors) +
  geom_jitter(size = 3, shape = 21, aes(fill = fraction), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) + 
  ylab("Inverse Simpson") + xlab("Fraction") + 
  geom_path(data = invsimps_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=80, fontface = "bold",  size = 4, color = "#424645", label= "NS") +
  theme(legend.position = "none", #axis.title.y = element_blank(),
        axis.title.x = element_blank(), axis.text.x = element_blank()) +
  coord_flip()


# INVERSE SIMPSON 
# Plot Inverse Simpson vs Per-Liter Heterotrophic Production  
prod_vs_invsimps_plot <-  
  ggplot(invsimps, aes(x=mean, y=frac_bacprod)) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) +  # Y-axis errorbars
  geom_point(size = 3.5, shape = 21, color = "black", aes(fill = fraction)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  ylab("Heterotrophic Production \n (μgC/L/hr)") + 
  xlab("Inverse Simpson") +
  geom_smooth(data=filter(invsimps, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) + 
  annotate("text", x = 70, y=45, color = "#FF6600", fontface = "bold",size = 4,
           label = paste("R2 =", round(summary(lm_prod_vs_invsimps_PA)$adj.r.squared, digits = 2), "\n", 
                         "p =", round(unname(summary(lm_prod_vs_invsimps_PA)$coefficients[,4][2]), digits = 4))) +
  theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank()) 



# Inverse Simpson vs per-cell production Plot 
percell_prod_vs_invsimps_plot <- 
  ggplot(invsimps, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, shape = 21, color = "black", aes(fill = fraction)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  ylab("log10(production/cell)\n (μgC/cell/hr)") +
  xlab("Inverse Simpson") +
  geom_smooth(data=filter(invsimps, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) + 
  #scale_y_continuous(limits = c(-8e-7,5e-6), breaks = seq(from = 0, to = 6e-7, by = 3e-7)) + 
  annotate("text", x = 70, y=-7.5, color = "#FF6600", fontface = "bold",size = 4,
           label = paste("R2 =", round(summary(lm_percell_prod_vs_invsimps_PA)$adj.r.squared, digits = 2), "\n", 
                         "p =", round(unname(summary(lm_percell_prod_vs_invsimps_PA)$coefficients[,4][2]), digits = 4))) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))


# Plot Inverse simpson plots altogether
invsimps_plots <- plot_grid(invsimps_distribution_plot, prod_vs_invsimps_plot, percell_prod_vs_invsimps_plot,
          labels = c("B", "D", "F"), ncol = 1, nrow = 3,
          rel_heights = c(0.5, 0.8, 1.1),
          align = "v")
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font metrics unknown for Unicode character U+03bc
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/L/hr)' in 'mbcsToSbcs': dot substituted for <ce>
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/L/hr)' in 'mbcsToSbcs': dot substituted for <bc>
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbarh).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font metrics unknown for Unicode character U+03bc
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/cell/hr)' in 'mbcsToSbcs': dot substituted for <ce>
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/cell/hr)' in 'mbcsToSbcs': dot substituted for <bc>

Plot Figure 2

invsimps_plots_noyaxis <- 
  plot_grid(invsimps_distribution_plot + theme(axis.title.y = element_blank(), axis.text.y = element_blank()), 
            prod_vs_invsimps_plot + theme(axis.title.y = element_blank()), 
            percell_prod_vs_invsimps_plot + theme(axis.title.y = element_blank()), 
          labels = c("B", "D", "F"), ncol = 1, nrow = 3,
          rel_heights = c(0.5, 0.8, 1.1),
          align = "v")
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbarh).
## Warning: Removed 1 rows containing missing values (geom_point).
plot_grid(rich_plots, invsimps_plots_noyaxis,
          ncol = 2, nrow = 1, rel_widths = c(1, 0.825),
          align = "h")

Prepare Figure S2

######################################################### SHANNON_ENTROPY
shannon_fraction_wilcox <- wilcox.test(mean ~ fraction, data = shannon)
shannon_fraction_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  mean by fraction
## W = 119, p-value = 0.00556
## alternative hypothesis: true location shift is not equal to 0
filter(shannon) %>%
  group_by(fraction) %>%
  summarize(mean(mean), sd(mean))
## # A tibble: 2 × 3
##   fraction `mean(mean)` `sd(mean)`
##     <fctr>        <dbl>      <dbl>
## 1 Particle     4.534078  0.5594638
## 2     Free     3.982314  0.2958221
# Make a data frame to draw significance line in boxplot (visually calculated)
shannon_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(5.8, 5.9, 5.9, 5.8)) # WholePart vs WholeFree

shannon_distribution_plot <- 
  ggplot(shannon, aes(y = mean, x = fraction)) +
  scale_fill_manual(values = fraction_colors) +
  geom_jitter(size = 3, shape = 21, aes(fill = fraction), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(3.5,6), breaks = seq(from = 3, to =6, by = 0.5)) + 
  xlab("Shannon Entropy") +
  xlab("Fraction") + 
    # Add line and pval
  geom_path(data = shannon_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=5.6, fontface = "bold",  size = 4, color = "#424645",
           label= paste("***\np =", round(shannon_fraction_wilcox$p.value, digits = 3))) +
  theme(legend.position = "none",# axis.title.y = element_blank(),
        axis.title.x = element_blank(), axis.text.x = element_blank()) +
  coord_flip()


# Shannon vs Per-Liter Heterotrophic Production Plot 
prod_vs_shannon_plot <-  
  ggplot(shannon, aes(x=mean, y=frac_bacprod)) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) +  # Y-axis errorbars
  geom_point(size = 3.5, shape = 21, color = "black", aes(fill = fraction)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  ylab("Heterotrophic Production \n (μgC/L/hr)") + 
  scale_x_continuous(limits = c(3.5,6), breaks = seq(from = 3, to =6, by = 0.5)) + 
  xlab("Shannon Entropy") +
  geom_smooth(data=filter(shannon, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  annotate("text", x = 5.25, y=45, color = "#FF6600", fontface = "bold", size = 4,
           label = paste("R2 =", round(summary(lm_prod_vs_shannon_PA)$adj.r.squared, digits = 2), "\n", 
                         "p =", round(unname(summary(lm_prod_vs_shannon_PA)$coefficients[,4][2]), digits = 3))) +
  theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank()) 


# Richness vs Per-cell herterotrophic production Plot 
percell_prod_vs_shannon_plot <- 
  ggplot(shannon, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, shape = 21, color = "black", aes(fill = fraction)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  ylab("log10(production/cell)\n (μgC/cell/hr)") +
  xlab("Shannon Entropy") +
  geom_smooth(data=filter(shannon, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(3.5,6), breaks = seq(from = 3, to =6, by = 0.5)) + 
  #scale_y_continuous(limits = c(-8e-7,5e-6), breaks = seq(from = 0, to = 6e-7, by = 3e-7)) + 
  annotate("text", x = 5.25, y=-7.5, color = "#FF6600", fontface = "bold", size = 4,
           label = paste("R2 =", round(summary(lm_percell_prod_vs_shannon_PA)$adj.r.squared, digits = 2), "\n", 
                         "p =", round(unname(summary(lm_percell_prod_vs_shannon_PA)$coefficients[,4][2]), digits = 3))) + 
  #annotate("text", x = 250, y=55, color = "skyblue", fontface = "bold", label = paste("Free = NS")) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))



shannon_plots <- plot_grid(shannon_distribution_plot, prod_vs_shannon_plot, percell_prod_vs_shannon_plot,
          labels = c("A", "C", "E"), ncol = 1, nrow = 3,
          rel_heights = c(0.5, 0.8, 1.1),
          align = "v")
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_errorbarh).
## Warning: Removed 1 rows containing missing values (geom_errorbar).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font metrics unknown for Unicode character U+03bc
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/L/hr)' in 'mbcsToSbcs': dot substituted for <ce>
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/L/hr)' in 'mbcsToSbcs': dot substituted for <bc>
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_errorbarh).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font metrics unknown for Unicode character U+03bc
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/cell/hr)' in 'mbcsToSbcs': dot substituted for <ce>
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/cell/hr)' in 'mbcsToSbcs': dot substituted for <bc>
######################################################### INVERSE SIMPSON
simpseven_fraction_wilcox <- wilcox.test(mean ~ fraction, data = simpseven)
simpseven_fraction_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  mean by fraction
## W = 55, p-value = 0.3474
## alternative hypothesis: true location shift is not equal to 0
filter(simpseven) %>%
  group_by(fraction) %>%
  summarize(mean(mean), sd(mean))
## # A tibble: 2 × 3
##   fraction `mean(mean)` `sd(mean)`
##     <fctr>        <dbl>      <dbl>
## 1 Particle   0.05890559 0.02537484
## 2     Free   0.07138806 0.01716014
# Make a data frame to draw significance line in boxplot (visually calculated)
simpseven_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(0.14, 0.15, 0.15, 0.14)) # WholePart vs WholeFree

simpseven_distribution_plot <- 
  ggplot(simpseven, aes(y = mean, x = fraction)) +
  scale_fill_manual(values = fraction_colors) +
  geom_jitter(size = 3, shape = 21, aes(fill = fraction), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(0,0.15), breaks = seq(from = 0, to = 0.15, by = 0.03)) + 
  ylab("Simpson's Evenness") +
  xlab("Fraction") + 
  geom_path(data = simpseven_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=0.14, fontface = "bold",  size = 4, color = "#424645", label= "NS") +
  theme(legend.position = "none",
        axis.title.x = element_blank(), axis.text.x = element_blank()) +
  coord_flip()



# SIMPSONS EVENNESS
# Plot Inverse Simpson vs Per-Liter Heterotrophic Production  
prod_vs_simpseven_plot <-  
  ggplot(simpseven, aes(x=mean, y=frac_bacprod)) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) +  # Y-axis errorbars
  geom_point(size = 3.5, shape = 21, color = "black", aes(fill = fraction)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  ylab("Heterotrophic Production \n (μgC/L/hr)") + 
  ylab("Simpson's Evenness") +
  geom_smooth(data=filter(simpseven, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(0,0.15), breaks = seq(from = 0, to = 0.15, by = 0.03)) + 
  annotate("text", x = 0.14, y=15, color = "#FF6600", fontface = "bold",size = 4,
           label = paste("R2 =", round(summary(lm_prod_vs_simpseven_PA)$adj.r.squared, digits = 2), "\n", 
                         "p =", round(unname(summary(lm_prod_vs_simpseven_PA)$coefficients[,4][2]), digits = 3))) +
  theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank()) 




# Plot 
percell_prod_vs_simpseven_plot <- 
  ggplot(simpseven, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, shape = 21, color = "black", aes(fill = fraction)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  ylab("log10(production/cell)\n (μgC/cell/hr)") +
  xlab("Simpson's Evenness") +
  geom_smooth(data=filter(simpseven, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(0,0.15), breaks = seq(from = 0, to = 0.15, by = 0.03)) + 
  annotate("text", x = 0.14, y=-6.3, color = "#FF6600", fontface = "bold",size = 4,
           label = paste("R2 =", round(summary(lm_percell_prod_vs_simpseven_PA)$adj.r.squared, digits = 2), "\n", 
                         "p =", round(unname(summary(lm_percell_prod_vs_simpseven_PA)$coefficients[,4][2]), digits = 4))) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))



simpseven_plots <- plot_grid(simpseven_distribution_plot, prod_vs_simpseven_plot, percell_prod_vs_simpseven_plot,
          labels = c("B", "D", "F"), ncol = 1, nrow = 3,
          rel_heights = c(0.5, 0.8, 1.1),
          align = "v")
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbarh).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font metrics unknown for Unicode character U+03bc
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/cell/hr)' in 'mbcsToSbcs': dot substituted for <ce>
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/cell/hr)' in 'mbcsToSbcs': dot substituted for <bc>

Plot Figure S2

simpseven_plots_noyaxis <- 
  plot_grid(simpseven_distribution_plot + theme(axis.title.y = element_blank(), axis.text.y = element_blank()), 
            prod_vs_simpseven_plot + theme(axis.title.y = element_blank()), 
            percell_prod_vs_simpseven_plot + theme(axis.title.y = element_blank()), 
          labels = c("B", "D", "F"), ncol = 1, nrow = 3,
          rel_heights = c(0.5, 0.8, 1.1),
          align = "v")
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbarh).
## Warning: Removed 1 rows containing missing values (geom_point).
plot_grid(shannon_plots, simpseven_plots_noyaxis,
          ncol = 2, nrow = 1, rel_widths = c(1, 0.825),
          align = "h")

lm_simpseven_bulkprod <- lm(frac_bacprod ~ mean, 
                            data = filter(otu_alphadiv, measure == "Simpsons_Evenness"))

ggplot(filter(otu_alphadiv, measure == "Simpsons_Evenness"), 
       aes(x = mean, y = frac_bacprod)) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) +  # Y-axis errorbars
  ylab("Heterotrophic Production \n(ug C/L/hr)") +
  xlab("Simpson's Evenness") + 
  geom_point(size = 3, shape = 21, aes(fill = fraction)) + 
  geom_smooth(method = "lm", color = "#424645", fill = "#424645", alpha = 0.3) +
  scale_fill_manual(values = fraction_colors) +
  scale_color_manual(values = fraction_colors) +
  theme(legend.position = c(0.15, 0.9), legend.title = element_blank()) +
  annotate("text", x = 0.115, y=55, color = "#424645", fontface = "bold", size = 4,
           label = paste("R2 =", round(summary(lm_simpseven_bulkprod)$adj.r.squared, digits = 2), "\n", 
                         "p =", round(unname(summary(lm_simpseven_bulkprod)$coefficients[,4][2]), digits = 3))) 

Prepare Figure S3

plot_all_rich_percell <- 
  ggplot(richness, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd), color = "grey", alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, shape = 21, color = "black", fill =  "grey") +
  ylab("log10(production/cell)\n (μgC/cell/hr)") + xlab("Observed Richness") +
  geom_smooth(method='lm', color = "#424645", fill = "#424645") + 
  scale_x_continuous(limits = c(150,950), breaks = seq(from = 0, to =925, by = 150)) + 
  scale_y_continuous(limits = c(-8.5, -5.4), breaks = seq(from = -9, to = -5, by = 1)) + 
  annotate("text", x = 790, y=-8, color = "#424645", fontface = "bold", size = 4,
           label = paste("R2 =", round(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = richness))$adj.r.squared, digits = 2), "\n", 
                         "p =", round(unname(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = richness))$coefficients[,4][2]), digits = 6))) + 
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))

plot_all_shannon_percell <- 
  ggplot(shannon, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd), color = "grey", alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, shape = 21, color = "black", fill =  "grey") +
  ylab("log10(production/cell)\n (μgC/cell/hr)") + xlab("Shannon Entropy") +
  geom_smooth(method='lm', color = "#424645", fill = "#424645") + 
  scale_x_continuous(limits = c(3.5,6), breaks = seq(from = 3, to =6, by = 0.5)) + 
  scale_y_continuous(limits = c(-8.5, -5.4), breaks = seq(from = -9, to = -5, by = 1)) + 
  annotate("text", x = 5.25, y=-8, color = "#424645", fontface = "bold", size = 4,
           label = paste("R2 =", round(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = shannon))$adj.r.squared, digits = 2), "\n", 
                         "p =", round(unname(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = shannon))$coefficients[,4][2]), digits = 5))) + 
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))


plot_all_invsimps_percell <- 
  ggplot(invsimps, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd), color = "grey", alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, shape = 21, color = "black", fill =  "grey") +
  ylab("log10(production/cell)\n (μgC/cell/hr)") + xlab("Inverse Simpson") +
  geom_smooth(method='lm', color = "#424645", fill = "#424645") + 
  scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) + 
  scale_y_continuous(limits = c(-8.5, -5.4), breaks = seq(from = -9, to = -5, by = 1)) + 
  annotate("text", x = 70, y=-8, color = "#424645", fontface = "bold", size = 4,
           label = paste("R2 =", round(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = invsimps))$adj.r.squared, digits = 2), "\n", 
                         "p =", round(unname(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = invsimps))$coefficients[,4][2]), digits = 5))) + 
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))


plot_all_simpseven_percell <- 
  ggplot(simpseven, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd), color = "grey", alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, shape = 21, color = "black", fill =  "grey") +
  ylab("log10(production/cell)\n (μgC/cell/hr)") + xlab("Inverse Simpson") +
  geom_smooth(method='lm', color = "#424645", fill = "#424645") + 
  scale_x_continuous(limits = c(0,0.15), breaks = seq(from = 0, to = 0.15, by = 0.03)) + 
  scale_y_continuous(limits = c(-8.5, -5.4), breaks = seq(from = -9, to = -5, by = 1)) + 
  annotate("text", x = 0.12, y=-8, color = "#424645", fontface = "bold", size = 4,
           label = paste("R2 =", round(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = simpseven))$adj.r.squared, digits = 2), "\n", 
                         "p =", round(unname(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = simpseven))$coefficients[,4][2]), digits = 2))) + 
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))


plot_grid(plot_all_rich_percell, 
          plot_all_shannon_percell + theme(axis.title.y = element_blank()), 
          plot_all_invsimps_percell + theme(axis.title.y = element_blank()), 
          plot_all_simpseven_percell + theme(axis.title.y = element_blank()),
          align = "h", labels = c("A", "B", "C", "D"),
          rel_widths = c(1.05, 0.9, 0.9, 0.9),
          nrow = 1, ncol = 4)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbarh).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font metrics unknown for Unicode character U+03bc
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/cell/hr)' in 'mbcsToSbcs': dot substituted for <ce>
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/cell/hr)' in 'mbcsToSbcs': dot substituted for <bc>
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_errorbarh).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbarh).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbarh).
## Warning: Removed 1 rows containing missing values (geom_point).

Calculate the Phylogenetic Diversity

# Read in the tree
RAREFIED_rm2_fasttree <- read.tree(file = "data/PhyloTree/newick_tree_rm2_rmN.tre")
  
# Load in data that has doubletons removed 
load("data/PhyloTree/surface_PAFL_otu_pruned_RAREFIED_rm2.RData")
surface_PAFL_otu_pruned_RAREFIED_rm2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1891 taxa and 24 samples ]
## sample_data() Sample Data:       [ 24 samples by 70 sample variables ]
## tax_table()   Taxonomy Table:    [ 1891 taxa by 8 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 1891 tips and 1889 internal nodes ]
# Create the OTU table for picante 
surface_PAFL_RAREFIED_rm2_otu <- matrix(otu_table(surface_PAFL_otu_pruned_RAREFIED_rm2), nrow = nrow(otu_table(surface_PAFL_otu_pruned_RAREFIED_rm2)))
rownames(surface_PAFL_RAREFIED_rm2_otu) <- sample_names(surface_PAFL_otu_pruned_RAREFIED_rm2)
colnames(surface_PAFL_RAREFIED_rm2_otu) <- taxa_names(surface_PAFL_otu_pruned_RAREFIED_rm2)
    
  
## Calculate input for SES_MPD  
# Convert the abundance data to standardized abundanced vegan function `decostand' , NOTE: method = "total"
otu_decostand_total <- decostand(surface_PAFL_RAREFIED_rm2_otu, method = "total")
# check total abundance in each sample
apply(otu_decostand_total, 1, sum)
## MBREJ515 MBREJ715 MBREJ915 MBREK515 MBREK715 MBREK915 MDPEJ515 MDPEJ715 MDPEJ915 MDPEK515 MDPEK715 MDPEK915 MINEJ515 MINEJ715 MINEJ915 MINEK515 MINEK715 MINEK915 MOTEJ515 MOTEJ715 MOTEJ915 MOTEK515 MOTEK715 MOTEK915 
##        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1
# check for mismatches/missing species between community data and phylo tree
RAREFIED_rm2_matches <- match.phylo.comm(RAREFIED_rm2_fasttree, otu_decostand_total)
# the resulting object is a list with $phy and $comm elements.  replace our
# original data with the sorted/matched data
phy_RAREFIED_rm2 <- RAREFIED_rm2_matches$phy
comm_RAREFIED_rm2 <- RAREFIED_rm2_matches$comm

# Calculate the phylogenetic distances
phy_dist_RAREFIED_rm2 <- cophenetic(phy_RAREFIED_rm2)

  
## Calculate SES_MPD
###################################### INDEPENDENT SWAP ############################################
# calculate standardized effect size mean pairwise distance (ses.mpd)
unweighted_sesMPD_indepswap_RAREFIED_rm2 <- ses.mpd(comm_RAREFIED_rm2, phy_dist_RAREFIED_rm2, null.model = "independentswap", 
                                     abundance.weighted = FALSE, runs = 999)

WEIGHTED_sesMPD_indepswap_RAREFIED_rm2 <- ses.mpd(comm_RAREFIED_rm2, phy_dist_RAREFIED_rm2, null.model = "independentswap", 
                                     abundance.weighted = TRUE, runs = 999)


# Gather div info
rich_df <- filter(otu_alphadiv, measure == "Richness") %>%
  dplyr::select(norep_filter_name, mean, sd, frac_bacprod, SD_frac_bacprod, fracprod_per_cell_noinf)

invsimps_df <- filter(otu_alphadiv, measure == "Inverse_Simpson") %>%
  dplyr::select(norep_filter_name, mean, sd, frac_bacprod, SD_frac_bacprod, fracprod_per_cell_noinf)


# Prepare to be merged with each other 
unweighted_df <- unweighted_sesMPD_indepswap_RAREFIED_rm2 %>%
  rownames_to_column("names") %>%
  mutate(phylo_measure = "Unweighted_SESMPD") %>%
  make_metadata_norep() %>%
  mutate(fraction = fct_recode(fraction, Particle = "WholePart", Free = "WholeFree")) %>%
  rename(norep_filter_name = names) %>%
  left_join(rich_df, by = "norep_filter_name") %>%
  mutate(lakesite = factor(lakesite, levels = c("MOT", "MDP","MBR", "MIN")),
         lakesite = fct_recode(lakesite, Outlet = "MOT", Deep = "MDP", Bear = "MBR", River = "MIN"))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining factor and character vector, coercing into character vector
weighted_df <- WEIGHTED_sesMPD_indepswap_RAREFIED_rm2 %>%
  rownames_to_column("names") %>%
  mutate(phylo_measure = "Weighted_SESMPD") %>%
  make_metadata_norep() %>%
  mutate(fraction = fct_recode(fraction, Particle = "WholePart", Free = "WholeFree")) %>%
  rename(norep_filter_name = names) %>%
  left_join(invsimps_df, by = "norep_filter_name") %>%
  mutate(lakesite = factor(lakesite, levels = c("MOT", "MDP","MBR", "MIN")),
         lakesite = fct_recode(lakesite, Outlet = "MOT", Deep = "MDP", Bear = "MBR", River = "MIN"))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining factor and character vector, coercing into character vector

Prepare the data for Ridge & Lasso Regressions

unweight_vals <- unweighted_df %>%
  dplyr::select(mpd.obs.z, norep_filter_name) %>%
  rename(unweighted_div = mpd.obs.z)
  
weighted_vals <- weighted_df %>%
  dplyr::select(mpd.obs.z, norep_filter_name) %>%
  rename(weighted_div = mpd.obs.z)


wide_all_divs <- otu_alphadiv %>%
  dplyr::select(norep_filter_name ,mean, measure) %>%
  spread(measure, mean)

lasso_data_df <- metadata_pca %>%
  left_join(wide_all_divs, by = "norep_filter_name") %>%
  left_join(unweight_vals, by = "norep_filter_name") %>%
  left_join(weighted_vals, by = "norep_filter_name") %>%
  dplyr::select(-c(project, year, Date, limnion, norep_water_name, dnaconcrep1, 
                   SD_perc_attached_bacprod, SE_total_bac_abund, SE_perc_attached_abund, SE_attached_bac))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining factor and character vector, coercing into character vector
lasso_data_df_particle <- lasso_data_df %>%
  filter(fraction == "Particle" ) %>%
  dplyr::select(-fraction)

lasso_data_df_particle_noprod <- lasso_data_df_particle %>%
  dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf, 
                   tot_bacprod, SD_tot_bacprod,SD_frac_bacprod, 
                   lakesite, season, norep_filter_name, station, BGA_cellspermL, Turb_NTU)) 
  
  # Must be numerical to go into the model 
lasso_data_df_particle_noprod <- apply(lasso_data_df_particle_noprod, 2, as.numeric) %>%
  as.data.frame()

percell_lasso_data_df_particle_noprod <- lasso_data_df_particle %>%
  dplyr::select(-c(fracprod_per_cell, frac_bacprod,
                   tot_bacprod, SD_tot_bacprod,SD_frac_bacprod, 
                   lakesite, season, norep_filter_name, station, BGA_cellspermL, Turb_NTU)) %>%
  dplyr::filter(Temp_C != 14.28) # Remove the missing row of data :( )

Multiple Regression

summary(lm(frac_bacprod ~ Inverse_Simpson, data = lasso_data_df_particle)) 
## 
## Call:
## lm(formula = frac_bacprod ~ Inverse_Simpson, data = lasso_data_df_particle)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5589 -2.1093 -0.1969  0.8752  7.8282 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.45199    2.43866  -0.185  0.85666    
## Inverse_Simpson  0.29344    0.05781   5.076  0.00048 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.571 on 10 degrees of freedom
## Multiple R-squared:  0.7204, Adjusted R-squared:  0.6925 
## F-statistic: 25.77 on 1 and 10 DF,  p-value: 0.0004804
summary(lm(frac_bacprod ~ PC2, data = lasso_data_df_particle)) 
## 
## Call:
## lm(formula = frac_bacprod ~ PC2, data = lasso_data_df_particle)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.1121  -4.1993   0.3866   2.9143  16.5918 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.958      2.119   4.701 0.000841 ***
## PC2           -4.032      2.048  -1.969 0.077229 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.339 on 10 degrees of freedom
## Multiple R-squared:  0.2795, Adjusted R-squared:  0.2074 
## F-statistic: 3.878 on 1 and 10 DF,  p-value: 0.07723
summary(lm(log10(fracprod_per_cell_noinf) ~ Inverse_Simpson, data = lasso_data_df_particle)) 
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ Inverse_Simpson, 
##     data = lasso_data_df_particle)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28651 -0.18384 -0.11125  0.07337  0.56444 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -7.360961   0.159490 -46.153 5.27e-12 ***
## Inverse_Simpson  0.018087   0.003759   4.812 0.000958 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2969 on 9 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7201, Adjusted R-squared:  0.689 
## F-statistic: 23.15 on 1 and 9 DF,  p-value: 0.0009581
summary(lm(log10(fracprod_per_cell_noinf) ~ PC2, data = lasso_data_df_particle)) 
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ PC2, data = lasso_data_df_particle)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.58949 -0.27765  0.04936  0.18633  0.87387 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.6883     0.1258 -53.183 1.48e-12 ***
## PC2          -0.3385     0.1237  -2.735    0.023 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4146 on 9 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.454,  Adjusted R-squared:  0.3933 
## F-statistic: 7.482 on 1 and 9 DF,  p-value: 0.02302
summary(lm(log10(fracprod_per_cell_noinf) ~ Inverse_Simpson + PC2, data = lasso_data_df_particle)) 
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ Inverse_Simpson + 
##     PC2, data = lasso_data_df_particle)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32534 -0.15648 -0.06430  0.04141  0.52396 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -7.250277   0.208543 -34.766 5.12e-10 ***
## Inverse_Simpson  0.015255   0.005087   2.999   0.0171 *  
## PC2             -0.101149   0.119891  -0.844   0.4234    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3017 on 8 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7429, Adjusted R-squared:  0.6787 
## F-statistic: 11.56 on 2 and 8 DF,  p-value: 0.004366

Perform Ridge and Lasso regression

Bulk Community Production

set.seed(777)

# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(frac_bacprod ~ ., lasso_data_df_particle_noprod)[,-1]
y = lasso_data_df_particle_noprod$frac_bacprod
grid = 10^seq(10,-2,length = 100)

# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model 
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]

################ RIDGE
# Run RIDGE regression with alpha = 0
ridge_divs_train <- glmnet(x[train,], y[train], alpha = 0, lambda = grid, thresh = 1e-12, standardize = TRUE)
par(mfrow = c(1,2)) 
plot(ridge_divs_train)
legend("top")
## Error in as.graphicsAnnot(legend): argument "legend" is missing, with no default
# Cross validation
cv_ridge_divs <- cv.glmnet(x[train,], y[train], alpha = 0)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_ridge_divs)

best_ridge_lambda <- cv_ridge_divs$lambda.min
ridge_divs_pred <- predict(ridge_divs_train, s = best_ridge_lambda, newx = x[test,])
mean((ridge_divs_pred - y_test)^2) # Test MSE
## [1] 76.26047
## Run ridge on the entire dataset 
ridge_divs <- glmnet(x, y, alpha = 0, lambda = grid, standardize = TRUE)
par(mfrow = c(1,1))
plot(ridge_divs)

legend("top")
## Error in as.graphicsAnnot(legend): argument "legend" is missing, with no default
################ LASSO
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = TRUE)
par(mfrow = c(1,2))
plot(lasso_divs_train)

# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)

best_lasso_lambda <- cv_lasso_divs$lambda.min
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)
## [1] 89.70601
## Run lasso on the entire dataset 
lasso_divs <- glmnet(x, y, alpha = 1, lambda = grid, standardize = TRUE)
par(mfrow = c(1,1))
plot(lasso_divs)

legend("top")
## Error in as.graphicsAnnot(legend): argument "legend" is missing, with no default
# What are the lasso coefficients?
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 35 x 1 sparse Matrix of class "dgCMatrix"
##                                1
## (Intercept)           7.98740428
## Sample_depth_m        .         
## Temp_C                .         
## SpCond_uSpercm        .         
## TDS_mgperL            .         
## pH                    .         
## ORP_mV                .         
## Chl_Lab_ugperL        .         
## Cl_mgperL             .         
## SO4_mgperL            .         
## NO3_mgperL            .         
## NH3_mgperL            .         
## TKN_mgperL            .         
## SRP_ugperL            .         
## TP_ugperL             .         
## Alk_mgperL            .         
## DO_mgperL             .         
## DO_percent            .         
## total_bac_abund       .         
## attached_bac          .         
## perc_attached_abund   .         
## perc_attached_bacprod .         
## fraction_bac_abund    .         
## PC1                   .         
## PC2                   .         
## PC3                   .         
## PC4                   .         
## PC5                   .         
## PC6                   .         
## Richness              .         
## Shannon_Entropy       .         
## Inverse_Simpson       0.05555576
## Simpsons_Evenness     .         
## unweighted_div        .         
## weighted_div          .
# What about for ridge regression?
predict(ridge_divs_pred, type = "coefficients", s = best_ridge_lambda)
## Error in UseMethod("predict"): no applicable method for 'predict' applied to an object of class "c('matrix', 'double', 'numeric')"

The test MSE for ridge regression is 76.2604659 while the test MSE for lasso is 89.7060061.

Additionally, the lasso model uses Inverse Simpson as the best and only predictor of production!

Per capita Production

percell_lasso_data_df_ALL <- lasso_data_df %>%
  dplyr::select(-c(fracprod_per_cell, frac_bacprod,
                   tot_bacprod, SD_tot_bacprod,SD_frac_bacprod, 
                   lakesite, season, norep_filter_name, station, BGA_cellspermL, Turb_NTU)) %>%
  dplyr::filter(!is.na(fracprod_per_cell_noinf)) %>%
  mutate(log10_percell = log10(fracprod_per_cell_noinf)) %>%
  dplyr::select(-c(fracprod_per_cell_noinf))

set.seed(777)

# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(log10_percell ~ ., percell_lasso_data_df_ALL)[,-1]
y = percell_lasso_data_df_ALL$log10_percell
grid = 10^seq(10,-2,length = 100)

# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model 
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]

################ RIDGE
# Run RIDGE regression with alpha = 0
ridge_divs_train <- glmnet(x[train,], y[train], alpha = 0, lambda = grid, thresh = 1e-12, standardize = TRUE)
par(mfrow = c(1,2)) 
plot(ridge_divs_train)

# Cross validation
cv_ridge_divs <- cv.glmnet(x[train,], y[train], alpha = 0)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_ridge_divs)

best_ridge_lambda <- cv_ridge_divs$lambda.min
ridge_divs_pred <- predict(ridge_divs_train, s = best_ridge_lambda, newx = x[test,])
mean((ridge_divs_pred - y_test)^2) # Test MSE
## [1] 0.438887
## Run ridge on the entire dataset 
ridge_divs <- glmnet(x, y, alpha = 0, lambda = grid, standardize = TRUE)
par(mfrow = c(1,1))
plot(ridge_divs)

################ LASSO
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = TRUE)
par(mfrow = c(1,2))
plot(lasso_divs_train)

# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)

best_lasso_lambda <- cv_lasso_divs$lambda.min
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)
## [1] 0.3199992
## Run lasso on the entire dataset 
lasso_divs <- glmnet(x, y, alpha = 1, lambda = grid, standardize = TRUE)
par(mfrow = c(1,1))
plot(lasso_divs)

# What are the lasso coefficients?
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 36 x 1 sparse Matrix of class "dgCMatrix"
##                                   1
## (Intercept)            2.608716e+00
## fractionFree          -3.928840e-01
## Sample_depth_m         .           
## Temp_C                 .           
## SpCond_uSpercm         .           
## TDS_mgperL             .           
## pH                    -1.204696e+00
## ORP_mV                 .           
## Chl_Lab_ugperL         .           
## Cl_mgperL              .           
## SO4_mgperL            -3.818350e-05
## NO3_mgperL             .           
## NH3_mgperL             .           
## TKN_mgperL             .           
## SRP_ugperL             .           
## TP_ugperL              .           
## Alk_mgperL             .           
## DO_mgperL              .           
## DO_percent             .           
## total_bac_abund        .           
## attached_bac          -3.387069e-07
## perc_attached_abund    .           
## perc_attached_bacprod  .           
## fraction_bac_abund     .           
## PC1                    .           
## PC2                    .           
## PC3                    .           
## PC4                    .           
## PC5                   -2.169245e-02
## PC6                    .           
## Richness               1.026611e-03
## Shannon_Entropy        .           
## Inverse_Simpson        .           
## Simpsons_Evenness      .           
## unweighted_div        -1.874834e-02
## weighted_div           .

Figure 3: Unweighted ses.mpd

######################################################### SES MPD DISTRIBUTION: UNWEIGHTED  
unweighted_fraction_wilcox <- wilcox.test(mpd.obs.z ~ fraction, data = unweighted_df)
unweighted_fraction_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  mpd.obs.z by fraction
## W = 30, p-value = 0.01449
## alternative hypothesis: true location shift is not equal to 0
unweighted_df %>%
  group_by(fraction) %>%
  summarize(mean(mpd.obs.z))
## # A tibble: 2 x 2
##   fraction `mean(mpd.obs.z)`
##     <fctr>             <dbl>
## 1 Particle        -0.4970703
## 2     Free         0.4375166
# Make a data frame to draw significance line in boxplot (visually calculated)
dat4 <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(1.6,1.7,1.7,1.6)) # WholePart vs WholeFree


unweight_distribution_plot <- 
  ggplot(unweighted_df, aes(y = mpd.obs.z, x = fraction)) +
  #scale_color_manual(values = fraction_colors) + 
  scale_fill_manual(values = fraction_colors) +
  geom_jitter(size = 3, shape = 21, aes(fill = fraction), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  ylab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
  xlab("Fraction") + 
  geom_hline(yintercept = 0, linetype = "dashed", size = 1.5) +
  ##### Particle vs free per-cell production 
  geom_path(data = dat4, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=1.35, fontface = "bold",  size = 4, color = "#424645",
           label= paste("***\np =", round(unweighted_fraction_wilcox$p.value, digits = 2))) +
  theme(legend.position = "none",# axis.title.y = element_blank(),
        axis.title.x = element_blank(), axis.text.x = element_blank()) +
  coord_flip()

# Is there a relationship between richness and Unweighted Mean Pairwise distance?
summary(lm(mean ~ mpd.obs.z, data = unweighted_df))
## 
## Call:
## lm(formula = mean ~ mpd.obs.z, data = unweighted_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -195.66  -96.12  -14.17   80.15  295.24 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   443.89      28.29  15.690 1.98e-13 ***
## mpd.obs.z    -124.90      34.37  -3.634  0.00147 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 138.5 on 22 degrees of freedom
## Multiple R-squared:  0.3751, Adjusted R-squared:  0.3467 
## F-statistic:  13.2 on 1 and 22 DF,  p-value: 0.001467
summary(lm(mean ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Particle")))
## 
## Call:
## lm(formula = mean ~ mpd.obs.z, data = filter(unweighted_df, fraction == 
##     "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -191.75 -112.16  -41.91   55.88  278.93 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   515.16      51.16  10.069 1.49e-06 ***
## mpd.obs.z     -83.76      49.91  -1.678    0.124    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 155 on 10 degrees of freedom
## Multiple R-squared:  0.2198, Adjusted R-squared:  0.1417 
## F-statistic: 2.816 on 1 and 10 DF,  p-value: 0.1242
summary(lm(mean ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Free")))
## 
## Call:
## lm(formula = mean ~ mpd.obs.z, data = filter(unweighted_df, fraction == 
##     "Free"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -100.15  -66.22  -18.72   43.66  172.93 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  337.075     42.750   7.885 1.34e-05 ***
## mpd.obs.z      3.083     77.507   0.040    0.969    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 90.18 on 10 degrees of freedom
## Multiple R-squared:  0.0001582,  Adjusted R-squared:  -0.09983 
## F-statistic: 0.001583 on 1 and 10 DF,  p-value: 0.9691
unweight_rich_vs_mpd_plot <- 
  ggplot(unweighted_df, aes(y = mean, x = mpd.obs.z)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
  geom_point(size = 3, shape = 21, aes(fill = fraction)) + ylab("Observed  Richness") +
  xlab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
  scale_fill_manual(values = fraction_colors) +
  geom_smooth(method = "lm", color = "#424645", fill = "#424645", alpha = 0.3) + 
  scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  annotate("text", x = 0.75, y=750, color = "#424645", fontface = "bold",
           label = paste("R2 =", round(summary(lm(mean ~ mpd.obs.z, data = unweighted_df))$adj.r.squared, digits = 2), "\n", 
                         "p =", round(unname(summary(lm(mean ~ mpd.obs.z, data = unweighted_df))$coefficients[,4][2]), digits = 3))) +
  theme(legend.title = element_blank(), legend.position = "none", 
        axis.text.x = element_blank(), axis.title.x = element_blank())


# Is there a relationship between Production and Unweighted Mean Pairwise distance?
#summary(lm(frac_bacprod ~ mpd.obs.z, data = unweighted_df)) # NS 

summary(lm(frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Particle")))
## 
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df, 
##     fraction == "Particle"))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.144 -4.312 -1.822  3.403 19.527 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    8.213      2.617   3.139   0.0105 *
## mpd.obs.z     -3.510      2.553  -1.375   0.1991  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.928 on 10 degrees of freedom
## Multiple R-squared:  0.159,  Adjusted R-squared:  0.07492 
## F-statistic: 1.891 on 1 and 10 DF,  p-value: 0.1991
summary(lm(frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Free")))
## 
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df, 
##     fraction == "Free"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.003 -14.934   2.196   8.756  36.997 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   18.183      8.396   2.166   0.0556 .
## mpd.obs.z     13.428     15.223   0.882   0.3984  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.71 on 10 degrees of freedom
## Multiple R-squared:  0.07219,    Adjusted R-squared:  -0.02059 
## F-statistic: 0.7781 on 1 and 10 DF,  p-value: 0.3984
unweight_prod_vs_mpd_plot <- 
  ggplot(unweighted_df, aes(y = frac_bacprod, x = mpd.obs.z, fill = fraction)) +
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
  geom_point(size = 3, shape = 21) + 
  ylab("Heterotrophic Production \n (μgC/L/hr)") +
  xlab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
  scale_fill_manual(values = fraction_colors) +
  scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  theme(legend.title = element_blank(), legend.position = "none",
        axis.text.x = element_blank(), axis.title.x = element_blank())


# Is there a relationship between PER-CELL PRODUCTION and Unweighted Mean Pairwise distance?
unweight_vs_percell <- lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = unweighted_df)
summary(unweight_vs_percell)
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = unweighted_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7226 -0.2897  0.0040  0.1294  1.3193 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -7.1988     0.0999 -72.057  < 2e-16 ***
## mpd.obs.z    -0.4972     0.1205  -4.127  0.00048 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4779 on 21 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.4478, Adjusted R-squared:  0.4215 
## F-statistic: 17.03 on 1 and 21 DF,  p-value: 0.0004795
summary(lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Particle")))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df, 
##     fraction == "Particle"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.37972 -0.24190 -0.16526  0.04437  1.18153 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.9247     0.1711 -40.472 1.71e-11 ***
## mpd.obs.z    -0.3298     0.1627  -2.027   0.0733 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4649 on 9 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3135, Adjusted R-squared:  0.2372 
## F-statistic: 4.109 on 1 and 9 DF,  p-value: 0.07327
summary(lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Free")))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df, 
##     fraction == "Free"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.63981 -0.29782  0.07682  0.17625  0.76499 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.55633    0.19602  -38.55 3.29e-12 ***
## mpd.obs.z   -0.04279    0.35539   -0.12    0.907    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4135 on 10 degrees of freedom
## Multiple R-squared:  0.001447,   Adjusted R-squared:  -0.09841 
## F-statistic: 0.01449 on 1 and 10 DF,  p-value: 0.9066
unweight_percell_vs_mpd_plot <- 
  ggplot(unweighted_df, 
       aes(y = log10(fracprod_per_cell_noinf), x = mpd.obs.z)) +
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
  geom_point(size = 3, shape = 21, aes(fill = fraction)) + 
  ylab("log10(Production/cell)\n (μgC/cell/hr)") +
  xlab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
  scale_fill_manual(values = fraction_colors) +
  geom_smooth(method = "lm", color="#424645", fill = "#424645", alpha = 0.3) +
  #stat_smooth(method="lm", se=TRUE, formula= y ~ poly(x, 2, raw=TRUE), color="#424645", fill = "#424645", alpha = 0.3) +
  scale_y_continuous(limits = c(-8.5,-5), breaks = c(-8, -7, -6)) + 
  scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  annotate("text", x = 0.75, y=-6, color = "#424645", fontface = "bold",
         label = paste("R2 =", round(summary(unweight_vs_percell)$adj.r.squared, digits = 2), "\n", 
                       "p =", round(unname(summary(unweight_vs_percell)$coefficients[,4][2]), digits = 4))) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))


plot_grid(unweight_distribution_plot, unweight_rich_vs_mpd_plot, unweight_prod_vs_mpd_plot, unweight_percell_vs_mpd_plot, 
          labels = c("A", "B", "C", "D"), ncol = 1, nrow = 4,
          rel_heights = c(0.5, 0.8, 0.8, 1.2),
          align = "v")
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font metrics unknown for Unicode character U+03bc
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/L/hr)' in 'mbcsToSbcs': dot substituted for <ce>
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/L/hr)' in 'mbcsToSbcs': dot substituted for <bc>
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font metrics unknown for Unicode character U+03bc
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/cell/hr)' in 'mbcsToSbcs': dot substituted for <ce>
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/cell/hr)' in 'mbcsToSbcs': dot substituted for <bc>

Figure S6: Weighted ses.mpd

######################################################### SES MPD DISTRIBUTION: WEIGHTED 
weighted_fraction_wilcox <- wilcox.test(mpd.obs.z ~ fraction, data = weighted_df)
weighted_fraction_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  mpd.obs.z by fraction
## W = 81, p-value = 0.6297
## alternative hypothesis: true location shift is not equal to 0
filter(weighted_df) %>%
  group_by(fraction) %>%
  summarize(mean(mpd.obs.z))
## # A tibble: 2 x 2
##   fraction `mean(mpd.obs.z)`
##     <fctr>             <dbl>
## 1 Particle        -0.3607944
## 2     Free        -0.3854885
# Make a data frame to draw significance line in boxplot (visually calculated)
dat5 <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(1.6,1.7,1.7,1.6)) # WholePart vs WholeFree


weight_distribution_plot <- 
  ggplot(weighted_df, aes(y = mpd.obs.z, x = fraction)) +
  #scale_color_manual(values = fraction_colors) + 
  scale_fill_manual(values = fraction_colors) +
  geom_jitter(size = 3, shape = 21, aes(fill = fraction), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  ylab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
  xlab("Fraction") + 
  geom_hline(yintercept = 0, linetype = "dashed", size = 1.5) +
  ##### Particle vs free per-cell production 
  geom_path(data = dat5, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=1.55, fontface = "bold",  size = 3.5, color = "#424645", label= "NS") +
  theme(legend.position = "none", #axis.title.y = element_blank(),
        axis.title.x = element_blank(), axis.text.x = element_blank()) +
  coord_flip()


# Is there a relationship between inverse simpson and Weighted Mean Pairwise distance?
#summary(lm(mean ~ mpd.obs.z, data = weighted_df)) # NS
  
summary(lm(mean ~ mpd.obs.z, data = filter(weighted_df, fraction == "Particle")))
## 
## Call:
## lm(formula = mean ~ mpd.obs.z, data = filter(weighted_df, fraction == 
##     "Particle"))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -28.42 -19.15   0.40  12.95  40.67 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    23.58      12.61   1.870   0.0911 .
## mpd.obs.z     -32.98      29.43  -1.121   0.2886  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.57 on 10 degrees of freedom
## Multiple R-squared:  0.1116, Adjusted R-squared:  0.02276 
## F-statistic: 1.256 on 1 and 10 DF,  p-value: 0.2886
summary(lm(mean ~ mpd.obs.z, data = filter(weighted_df, fraction == "Free")))
## 
## Call:
## lm(formula = mean ~ mpd.obs.z, data = filter(weighted_df, fraction == 
##     "Free"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.0597  -4.3465  -0.8155   3.8832  18.4294 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   22.187      4.720   4.701  0.00084 ***
## mpd.obs.z     -4.942     10.486  -0.471  0.64753    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.441 on 10 degrees of freedom
## Multiple R-squared:  0.02173,    Adjusted R-squared:  -0.0761 
## F-statistic: 0.2221 on 1 and 10 DF,  p-value: 0.6475
weight_invsimps_vs_mpd_plot <- 
  ggplot(weighted_df, aes(y = mean, x = mpd.obs.z, fill = fraction)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
  geom_point(size = 3, shape = 21) + ylab("Inverse Simpson") +
  #xlab("Standardized Effect Size \n Weighted Mean Pairwise Dist") +
  scale_fill_manual(values = fraction_colors) +
  scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  theme(legend.title = element_blank(), legend.position = "none",
        axis.text.x = element_blank(), axis.title.x = element_blank())




# Is there a relationship between PER-LITER PRODUCTION and WEIGHTED Mean Pairwise distance?
summary(lm(frac_bacprod ~ mpd.obs.z, data = weighted_df))
## 
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = weighted_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.030 -10.236  -2.842   5.937  38.409 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    9.008      5.664   1.590    0.126
## mpd.obs.z    -21.439     12.889  -1.663    0.110
## 
## Residual standard error: 14.66 on 22 degrees of freedom
## Multiple R-squared:  0.1117, Adjusted R-squared:  0.07133 
## F-statistic: 2.767 on 1 and 22 DF,  p-value: 0.1104
summary(lm(frac_bacprod ~ mpd.obs.z, data = filter(weighted_df, fraction == "Particle")))
## 
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = filter(weighted_df, 
##     fraction == "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.240  -4.575  -1.717   4.503  15.352 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    4.391      4.126   1.064    0.312
## mpd.obs.z    -15.432      9.627  -1.603    0.140
## 
## Residual standard error: 7.711 on 10 degrees of freedom
## Multiple R-squared:  0.2044, Adjusted R-squared:  0.1249 
## F-statistic: 2.569 on 1 and 10 DF,  p-value: 0.14
summary(lm(frac_bacprod ~ mpd.obs.z, data = filter(weighted_df, fraction == "Free")))
## 
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = filter(weighted_df, 
##     fraction == "Free"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.790 -12.275  -3.481   7.360  30.573 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   14.697      9.683   1.518    0.160
## mpd.obs.z    -24.285     21.512  -1.129    0.285
## 
## Residual standard error: 17.32 on 10 degrees of freedom
## Multiple R-squared:  0.113,  Adjusted R-squared:  0.02434 
## F-statistic: 1.274 on 1 and 10 DF,  p-value: 0.2853
weight_prod_vs_mpd_plot <- 
  ggplot(weighted_df, aes(y = frac_bacprod, x = mpd.obs.z, fill = fraction)) +
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
  geom_point(size = 3, shape = 21) + 
  ylab("Heterotrophic Production \n (μgC/L/hr)") +
  xlab("Standardized Effect Size \n Weighted Mean Pairwise Dist") +
  scale_fill_manual(values = fraction_colors) +
  scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  theme(legend.title = element_blank(), legend.position = "none",
        axis.text.x = element_blank(), axis.title.x = element_blank())


# Is there a relationship between PER-CELL PRODUCTION and WEIGHTED Mean Pairwise distance?
summary(lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = weighted_df))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = weighted_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.94765 -0.40616 -0.03619  0.31885  1.39101 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -7.5352     0.2442 -30.860   <2e-16 ***
## mpd.obs.z    -0.9519     0.5446  -1.748   0.0951 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6009 on 21 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.127,  Adjusted R-squared:  0.08544 
## F-statistic: 3.055 on 1 and 21 DF,  p-value: 0.09508
summary(lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(weighted_df, fraction == "Particle")))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(weighted_df, 
##     fraction == "Particle"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65329 -0.32632 -0.03486  0.22224  0.95307 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -7.0845     0.3015 -23.496 2.18e-09 ***
## mpd.obs.z    -0.9337     0.6753  -1.383      0.2    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5096 on 9 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1752, Adjusted R-squared:  0.08357 
## F-statistic: 1.912 on 1 and 9 DF,  p-value: 0.2001
lm_percell_free_mpd <- lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(weighted_df, fraction == "Free"))
summary(lm_percell_free_mpd)
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(weighted_df, 
##     fraction == "Free"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.54189 -0.16159 -0.00204  0.20070  0.44618 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -7.9519     0.1848 -43.019 1.11e-12 ***
## mpd.obs.z    -0.9777     0.4107  -2.381   0.0386 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3306 on 10 degrees of freedom
## Multiple R-squared:  0.3617, Adjusted R-squared:  0.2979 
## F-statistic: 5.667 on 1 and 10 DF,  p-value: 0.03857
weight_percell_vs_mpd_plot <- 
  ggplot(weighted_df, 
       aes(y = log10(fracprod_per_cell_noinf), x = mpd.obs.z, fill = fraction)) +
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
  geom_point(size = 3, shape = 21) + 
  ylab("log10(Production/cell)\n (μgC/cell/hr)") +
  xlab("Standardized Effect Size \n Weighted Mean Pairwise Dist") +
  scale_fill_manual(values = fraction_colors) +
  geom_smooth(method = "lm", data = filter(weighted_df, fraction == "Free"), color = "skyblue", fill = "skyblue", alpha = 0.3) +
  scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
    scale_y_continuous(limits = c(-8.5,-5), breaks = c(-8, -7, -6)) + 
  annotate("text", x = 0.3, y=-7.9, color = "skyblue", fontface = "bold",
     label = paste("R2 =", round(summary(lm_percell_free_mpd)$adj.r.squared, digits = 3), "\n", 
                   "p =", round(unname(summary(lm_percell_free_mpd)$coefficients[,4][2]), digits = 3))) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))


plot_grid(weight_distribution_plot, weight_invsimps_vs_mpd_plot, weight_prod_vs_mpd_plot, weight_percell_vs_mpd_plot, 
          labels = c("A", "B", "C", "D"), ncol = 1, nrow = 4,
          rel_heights = c(0.5, 0.8, 0.8, 1.2),
          align = "v")
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font metrics unknown for Unicode character U+03bc
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/L/hr)' in 'mbcsToSbcs': dot substituted for <ce>
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/L/hr)' in 'mbcsToSbcs': dot substituted for <bc>
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font metrics unknown for Unicode character U+03bc
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/cell/hr)' in 'mbcsToSbcs': dot substituted for <ce>
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): conversion failure on ' (μgC/cell/hr)' in 'mbcsToSbcs': dot substituted for <bc>

Cell counts and Unweighted SESmpd

# Combine the datasets into 
cell_nums <- otu_alphadiv %>%
  dplyr::select(norep_filter_name, fraction_bac_abund) %>%
  distinct()

unweight_cellnums <- cell_nums %>%
  left_join(unweighted_df, by = "norep_filter_name") %>%
  dplyr::filter(norep_filter_name != "MOTEJ515")
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining character vector and factor, coercing into character vector
# Is there a relationship between cell numbers and Unweighted Mean Pairwise distance?
summary(lm(log10(fraction_bac_abund) ~ mpd.obs.z, data = filter(unweight_cellnums)))
## 
## Call:
## lm(formula = log10(fraction_bac_abund) ~ mpd.obs.z, data = filter(unweight_cellnums))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4107 -0.1703  0.1661  0.3388  0.7078 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.2637     0.1119  47.041  < 2e-16 ***
## mpd.obs.z     0.5256     0.1349   3.895 0.000835 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5352 on 21 degrees of freedom
## Multiple R-squared:  0.4194, Adjusted R-squared:  0.3917 
## F-statistic: 15.17 on 1 and 21 DF,  p-value: 0.0008354
summary(lm(log10(fraction_bac_abund) ~ mpd.obs.z, data = filter(unweight_cellnums, fraction == "Particle")))
## 
## Call:
## lm(formula = log10(fraction_bac_abund) ~ mpd.obs.z, data = filter(unweight_cellnums, 
##     fraction == "Particle"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.60108 -0.11615  0.06289  0.23310  0.34729 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.61321    0.11608  39.742 2.01e-11 ***
## mpd.obs.z    0.06373    0.11039   0.577    0.578    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3154 on 9 degrees of freedom
## Multiple R-squared:  0.03572,    Adjusted R-squared:  -0.07143 
## F-statistic: 0.3334 on 1 and 9 DF,  p-value: 0.5778
summary(lm(log10(fraction_bac_abund) ~ mpd.obs.z, data = filter(unweight_cellnums, fraction == "Free")))
## 
## Call:
## lm(formula = log10(fraction_bac_abund) ~ mpd.obs.z, data = filter(unweight_cellnums, 
##     fraction == "Free"))
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.294515 -0.070928 -0.008011  0.116064  0.216553 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.76213    0.08022  71.829 6.67e-15 ***
## mpd.obs.z    0.16564    0.14544   1.139    0.281    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1692 on 10 degrees of freedom
## Multiple R-squared:  0.1148, Adjusted R-squared:  0.02629 
## F-statistic: 1.297 on 1 and 10 DF,  p-value: 0.2813
ggplot(unweight_cellnums, aes(y = log10(fraction_bac_abund), x = mpd.obs.z)) +
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
  geom_point(size = 3, shape = 21, aes(fill = fraction)) + 
  ylab("log10(bacterial cells/mL)") +
  xlab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
  scale_fill_manual(values = fraction_colors) +
  geom_smooth(method = "lm", color = "#424645", fill = "#424645", alpha = 0.3) + 
  scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  annotate("text", x = 0.75, y=4.25, color = "#424645", fontface = "bold",
           label = paste("R2 =", round(summary(lm(log10(fraction_bac_abund) ~ mpd.obs.z, data = unweight_cellnums))$adj.r.squared, digits = 2), "\n", 
                         "p =", round(unname(summary(lm(log10(fraction_bac_abund) ~ mpd.obs.z, data = unweight_cellnums))$coefficients[,4][2]), digits = 3))) +
  theme(legend.title = element_blank(), legend.position = c(0.12, 0.9))

Figure S7

# Test by station
part_unweighted_df <- filter(unweighted_df, fraction == "Particle")  
kruskal.test(mpd.obs.z ~ lakesite, data = filter(unweighted_df, fraction == "Particle"))
## 
##  Kruskal-Wallis rank sum test
## 
## data:  mpd.obs.z by lakesite
## Kruskal-Wallis chi-squared = 5.8718, df = 3, p-value = 0.118
unweighted_df %>%
 filter(fraction == "Particle") %>%
  group_by(lakesite) %>%
  summarize(mean(mpd.obs.z), sd(mpd.obs.z))
## # A tibble: 4 x 3
##   lakesite `mean(mpd.obs.z)` `sd(mpd.obs.z)`
##     <fctr>             <dbl>           <dbl>
## 1   Outlet         0.7950222       0.5856743
## 2     Deep        -0.9216060       0.3564753
## 3     Bear        -0.7817952       0.9425671
## 4    River        -1.0799021       0.2412376
# Lakesite 
plot_part_unweight_lakesite <- ggplot(filter(unweighted_df, fraction == "Particle"),
       aes(y = mpd.obs.z, x = lakesite)) +
  ggtitle("Particle-Associated Samples Only") + 
  scale_fill_manual(values = lakesite_colors) +
  ylab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
  geom_jitter(size = 3, shape = 21, aes(fill = lakesite), width = 0.2) + 
  geom_boxplot(aes(fill = lakesite), alpha = 0.5) +
  theme(axis.title.x = element_blank(),
        legend.position = c(0.85, 0.9), legend.title = element_blank())

# Season
plot_part_unweight_season <- ggplot(filter(unweighted_df, fraction == "Particle"),
       aes(y = mpd.obs.z, x = season)) +
  ggtitle("Particle-Associated Samples Only") + 
  ylab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
  scale_fill_manual(values = season_colors) +
  geom_jitter(size = 3, shape = 21, aes(fill = season), width = 0.2) + 
  geom_boxplot(aes(fill = season), alpha = 0.5) + 
  theme(axis.title.x = element_blank(), axis.title.y = element_blank(),
        legend.position = c(0.9, 0.9),  legend.title = element_blank())

plot_grid(plot_part_unweight_lakesite, plot_part_unweight_season, 
          labels = c("A", "B"), ncol = 2, nrow = 1, align = "h")

plot_station_rich <- 
  ggplot(filter(otu_alphadiv, measure == "Richness" & fraction == "Particle"), 
              aes(x = lakesite, y = mean, fill = lakesite)) + 
  geom_jitter(size = 3, shape = 21) + 
  ylab("Observed Richness") + 
  ggtitle("Particle Fraction Only") +  
  geom_boxplot(outlier.shape = NA, color = "black", alpha = 0.5) + 
  scale_fill_manual(values = lakesite_colors) +
  theme(legend.title = element_blank(),
        legend.position = c(0.12, 0.9))
  

otu_alphadiv %>%
  filter(measure == "Richness" & fraction == "Particle") %>%
  group_by(lakesite) %>%
  summarize(mean(mean), sd(mean))
## # A tibble: 4 x 3
##   lakesite `mean(mean)` `sd(mean)`
##     <fctr>        <dbl>      <dbl>
## 1   Outlet     430.8567   81.99728
## 2     Deep     472.0833   23.48010
## 3     Bear     637.9933  242.53768
## 4    River     686.2633  135.20325
plot_station_invsimps <-
  ggplot(filter(otu_alphadiv, measure == "Inverse_Simpson" & fraction == "Particle"), 
              aes(x = lakesite, y = mean, fill = lakesite)) + 
  geom_jitter(size = 3, shape = 21) + 
  ggtitle("Particle Fraction Only") + 
  ylab("Inverse Simpson") + 
  geom_boxplot(outlier.shape = NA, color = "black", alpha = 0.5) + 
  scale_fill_manual(values = lakesite_colors) +
  theme(legend.title = element_blank(),
        legend.position = "none")


otu_alphadiv %>%
  filter(measure == "Inverse_Simpson" & fraction == "Particle") %>%
  group_by(lakesite) %>%
  summarize(mean(mean))
## # A tibble: 4 x 2
##   lakesite `mean(mean)`
##     <fctr>        <dbl>
## 1   Outlet     23.66717
## 2     Deep     23.76370
## 3     Bear     35.36997
## 4    River     59.10553
plot_grid(plot_station_rich, plot_station_invsimps, 
          labels = c("A", "B"), ncol = 2, nrow = 1, align = "h")

####  FREE LIVING
plot_station_rich_FL <- 
  ggplot(filter(otu_alphadiv, measure == "Richness" & fraction == "Free"), 
              aes(x = lakesite, y = mean, fill = lakesite)) + 
  geom_jitter(size = 3, shape = 21) + 
  ylab("Observed Richness") + 
  ggtitle("Free-Living Fraction Only") +  
  geom_boxplot(outlier.shape = NA, color = "black", alpha = 0.5) + 
  scale_fill_manual(values = lakesite_colors) +
  theme(legend.title = element_blank(),
        legend.position = c(0.12, 0.9))
  

otu_alphadiv %>%
  filter(measure == "Richness" & fraction == "Free") %>%
  group_by(lakesite) %>%
  summarize(mean(mean), sd(mean))
## # A tibble: 4 x 3
##   lakesite `mean(mean)` `sd(mean)`
##     <fctr>        <dbl>      <dbl>
## 1   Outlet     297.5467   74.24756
## 2     Deep     313.9267   58.54588
## 3     Bear     293.9033   55.85800
## 4    River     448.3200   64.10530
plot_station_invsimps_FL <-
  ggplot(filter(otu_alphadiv, measure == "Inverse_Simpson" & fraction == "Free"), 
              aes(x = lakesite, y = mean, fill = lakesite)) + 
  geom_jitter(size = 3, shape = 21) + 
  ggtitle("Free-Living Fraction Only") + 
  ylab("Inverse Simpson") + 
  geom_boxplot(outlier.shape = NA, color = "black", alpha = 0.5) + 
  scale_fill_manual(values = lakesite_colors) +
  theme(legend.title = element_blank(),
        legend.position = "none")


otu_alphadiv %>%
  filter(measure == "Inverse_Simpson" & fraction == "Free") %>%
  group_by(lakesite) %>%
  summarize(mean(mean))
## # A tibble: 4 x 2
##   lakesite `mean(mean)`
##     <fctr>        <dbl>
## 1   Outlet     23.87979
## 2     Deep     22.42163
## 3     Bear     19.06537
## 4    River     31.00198
plot_grid(plot_station_rich_FL, plot_station_invsimps_FL, 
          labels = c("A", "B"), ncol = 2, nrow = 1, align = "h")

Prepare Figure S5: Randomized Richness

#########################################################  Subset only richness data 
### These are the richness values for the fake samples 
#rich_stats <- filter(otu_alphadiv, measure == "Richness") %>%
#  dplyr::select(1:2) %>%
#  rename(mean_richness = mean) %>%
#  mutate(sample = paste("Sample_", seq(1:nrow(filter(otu_alphadiv, measure == "Richness"))), sep = ""),
#         mean_richness = matround(mean_richness))

## Pick OTUs to match these richness values 

  # List the otus from ALL samples 
#  all_otus <- taxa_names(surface_PAFL_otu_pruned_rm2)
  
  # Obtain the OTU table from the phyloseq object
#  otutab <- otu_table(surface_PAFL_otu_pruned_rm2)
  # Make all the counts to be 0
#  otutab_newvals <- apply(otutab, c(1, 2), function(x) 0)

  # Stop if things are wrong 
#  stopifnot(colnames(otutab_newvals) == all_otus)                       # Do the OTU names match?
#  stopifnot(rownames(otutab_newvals) == rich_stats$norep_filter_name)   # Do the sample names match?
  
# Make it reproducible!   
#set.seed(777)
  
#for (row in 1:nrow(rich_stats)) {
  
  # Pick the richness value 
#  rich_val <- rich_stats[row, 2]  
  
  # Sample the OTUs to represent that richness value 
#  col_index <- sample(ncol(otutab_newvals), rich_val, replace = FALSE, prob = NULL)
  
  # make all other columns 0
#  otutab_newvals[row, col_index] <- 1

#}


## Calculate the tree for those randomized samples 
# create a new phyloseq object 
#random_physeq_presab_raw <- phyloseq(otu_table(otutab_newvals, taxa_are_rows = FALSE), 
#                                 tax_table(surface_PAFL_otu_pruned_rm2), sample_data(surface_PAFL_otu_pruned_rm2))
#random_physeq_presab_raw

# Remove taxa that are 0!
#random_physeq_presab_pruned <- prune_taxa(taxa_sums(random_physeq_presab_raw) > 0, random_physeq_presab_raw) 
#random_physeq_presab_pruned

# Calculate tree 
# Obtain the OTU names that were retained in the dataset
#tax <- data.frame(tax_table(random_physeq_presab_pruned))
#vector_of_otus <- as.vector(tax$OTU)

# Write out the file for processing/fasttree
# write(vector_of_otus, file = "data/PhyloTree/randomized/random_physeq_presab_pruned_OTUnames.txt", ncolumns = 1, append = FALSE, sep = "\n")

Plot Figure S5: Randomized Richness

# Read in the tree
randomized_tree <- read.tree(file = "data/PhyloTree/randomized/newick_tree_randomized_rmN.tre")
  
  #random_physeq_presab_pruned_tree <- merge_phyloseq(random_physeq_presab_pruned, randomized_tree)
  #random_physeq_presab_pruned_tree
#save(list="random_physeq_presab_pruned_tree", file=paste0("data/PhyloTree/randomized/random_physeq_presab_pruned_tree.RData")) 

load("data/PhyloTree/randomized/random_physeq_presab_pruned_tree.RData")
random_physeq_presab_pruned_tree
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2911 taxa and 24 samples ]
## sample_data() Sample Data:       [ 24 samples by 44 sample variables ]
## tax_table()   Taxonomy Table:    [ 2911 taxa by 8 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2911 tips and 2909 internal nodes ]
# First force the OTU 
randomized_otu <- matrix(otu_table(random_physeq_presab_pruned_tree), nrow = nrow(otu_table(random_physeq_presab_pruned_tree)))
rownames(randomized_otu) <- sample_names(random_physeq_presab_pruned_tree)
colnames(randomized_otu) <- taxa_names(random_physeq_presab_pruned_tree)
    
  
## Calculate input for SES_MPD  
# Convert the presence/absence data to standardized abundanced  vegan function `decostand' , NOTE: method = "pa"
otu_decostand <- decostand(randomized_otu, method = "pa")
# check total abundance in each sample
apply(otu_decostand, 1, sum)
## MBREJ515 MBREJ715 MBREJ915 MBREK515 MBREK715 MBREK915 MDPEJ515 MDPEJ715 MDPEJ915 MDPEK515 MDPEK715 MDPEK915 MINEJ515 MINEJ715 MINEJ915 MINEK515 MINEK715 MINEK915 MOTEJ515 MOTEJ715 MOTEJ915 MOTEK515 MOTEK715 MOTEK915 
##      906      574      434      268      256      358      493      447      476      276      284      381      840      632      586      452      383      511      505      343      444      274      238      381
# check for mismatches/missing species between community data and phylo tree
randomized_matches <- match.phylo.comm(randomized_tree, otu_decostand)
# the resulting object is a list with $phy and $comm elements.  replace our
# original data with the sorted/matched data
phy_randomized_rm2 <- randomized_matches$phy
comm_randomized_rm2 <- randomized_matches$comm

# Calculate the phylogenetic distances
phy_dist_randomized_rm2 <- cophenetic(phy_randomized_rm2)

  
## Calculate SES_MPD
###################################### INDEPENDENT SWAP ############################################
# calculate standardized effect size mean pairwise distance (ses.mpd)
unweighted_sesMPD_indepswap_randomized <- ses.mpd(comm_randomized_rm2, phy_dist_randomized_rm2, null.model = "independentswap", 
                                     abundance.weighted = FALSE, runs = 999)

df <- unweighted_sesMPD_indepswap_randomized

df$names <- row.names(df)
df_2 <- make_metadata_norep(df) %>%
  mutate(fraction = fct_recode(fraction, Particle = "WholePart", Free = "WholeFree"))
  
  
# Is there a relationship between richness and Unweighted Mean Pairwise distance?
summary(lm(ntaxa ~ mpd.obs.z, data = df_2))
## 
## Call:
## lm(formula = ntaxa ~ mpd.obs.z, data = df_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -212.74 -129.44  -12.54   53.58  468.91 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   448.27      35.48  12.633 1.47e-11 ***
## mpd.obs.z      56.32      94.24   0.598    0.556    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 173.7 on 22 degrees of freedom
## Multiple R-squared:  0.01598,    Adjusted R-squared:  -0.02875 
## F-statistic: 0.3572 on 1 and 22 DF,  p-value: 0.5562
summary(lm(ntaxa ~ mpd.obs.z, data = filter(df_2, fraction == "Particle")))
## 
## Call:
## lm(formula = ntaxa ~ mpd.obs.z, data = filter(df_2, fraction == 
##     "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -203.87 -109.15  -44.96   44.29  341.78 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   557.62      50.67  11.006 6.56e-07 ***
## mpd.obs.z     -33.22     140.62  -0.236    0.818    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 175 on 10 degrees of freedom
## Multiple R-squared:  0.00555,    Adjusted R-squared:  -0.0939 
## F-statistic: 0.05581 on 1 and 10 DF,  p-value: 0.818
summary(lm(ntaxa ~ mpd.obs.z, data = filter(df_2, fraction == "Free")))
## 
## Call:
## lm(formula = ntaxa ~ mpd.obs.z, data = filter(df_2, fraction == 
##     "Free"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -107.76  -48.80  -17.85   56.29  159.66 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   342.48      24.62  13.913 7.19e-08 ***
## mpd.obs.z      74.88      62.79   1.193    0.261    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 84.48 on 10 degrees of freedom
## Multiple R-squared:  0.1245, Adjusted R-squared:  0.03699 
## F-statistic: 1.422 on 1 and 10 DF,  p-value: 0.2605
ggplot(df_2, aes(y = ntaxa, x = mpd.obs.z, fill = fraction)) +
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
  geom_point(size = 3, shape = 21) + ylab("Randomized Richness") +
  xlab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
  scale_fill_manual(values = fraction_colors) +
  scale_x_continuous(limits = c(-1,1)) + 
  theme(legend.title = element_blank(), legend.position = c(0.15, 0.9))